diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index a0428336d300f..9cf64417d3cb2 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -1,3 +1,9 @@ +ClangIR: + - clang/include/clang/CIR/**/* + - clang/lib/CIR/**/* + - clang/tools/cir-*/**/* + - clang/test/CIR/**/* + clang:dataflow: - clang/include/clang/Analysis/FlowSensitive/**/* - clang/lib/Analysis/FlowSensitive/**/* @@ -938,3 +944,6 @@ openmp:libomptarget: bazel: - utils/bazel/** + +offload: + - offload/** diff --git a/bolt/include/bolt/Rewrite/RewriteInstance.h b/bolt/include/bolt/Rewrite/RewriteInstance.h index 826677cd63b22..af832b4c7c84c 100644 --- a/bolt/include/bolt/Rewrite/RewriteInstance.h +++ b/bolt/include/bolt/Rewrite/RewriteInstance.h @@ -368,13 +368,6 @@ class RewriteInstance { /// rewritten binary. void patchBuildID(); - /// Return file offset corresponding to a given virtual address. - uint64_t getFileOffsetFor(uint64_t Address) { - assert(Address >= NewTextSegmentAddress && - "address in not in the new text segment"); - return Address - NewTextSegmentAddress + NewTextSegmentOffset; - } - /// Return file offset corresponding to a virtual \p Address. /// Return 0 if the address has no mapping in the file, including being /// part of .bss section. @@ -398,9 +391,6 @@ class RewriteInstance { /// Return true if the section holds debug information. static bool isDebugSection(StringRef SectionName); - /// Return true if the section holds linux kernel symbol information. - static bool isKSymtabSection(StringRef SectionName); - /// Adds Debug section to overwrite. static void addToDebugSectionsToOverwrite(const char *Section) { DebugSectionsToOverwrite.emplace_back(Section); diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index fd2477231142e..4e0096cf988ae 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -5767,10 +5767,3 @@ bool RewriteInstance::isDebugSection(StringRef SectionName) { return false; } - -bool RewriteInstance::isKSymtabSection(StringRef SectionName) { - if (SectionName.starts_with("__ksymtab")) - return true; - - return false; -} diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp index 2fa7cd0baf98f..c507043c367a8 100644 --- a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.cpp @@ -85,10 +85,10 @@ void UnnecessaryValueParamCheck::check(const MatchFinder::MatchResult &Result) { TraversalKindScope RAII(*Result.Context, TK_AsIs); - FunctionParmMutationAnalyzer &Analyzer = - MutationAnalyzers.try_emplace(Function, *Function, *Result.Context) - .first->second; - if (Analyzer.isMutated(Param)) + FunctionParmMutationAnalyzer *Analyzer = + FunctionParmMutationAnalyzer::getFunctionParmMutationAnalyzer( + *Function, *Result.Context, MutationAnalyzerCache); + if (Analyzer->isMutated(Param)) return; const bool IsConstQualified = @@ -169,7 +169,7 @@ void UnnecessaryValueParamCheck::storeOptions( } void UnnecessaryValueParamCheck::onEndOfTranslationUnit() { - MutationAnalyzers.clear(); + MutationAnalyzerCache.clear(); } void UnnecessaryValueParamCheck::handleMoveFix(const ParmVarDecl &Var, diff --git a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h index 1872e3bc9bf29..7250bffd20b2f 100644 --- 
a/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h +++ b/clang-tools-extra/clang-tidy/performance/UnnecessaryValueParamCheck.h @@ -37,8 +37,7 @@ class UnnecessaryValueParamCheck : public ClangTidyCheck { void handleMoveFix(const ParmVarDecl &Var, const DeclRefExpr &CopyArgument, const ASTContext &Context); - llvm::DenseMap<const FunctionDecl *, FunctionParmMutationAnalyzer> - MutationAnalyzers; + ExprMutationAnalyzer::Memoized MutationAnalyzerCache; utils::IncludeInserter Inserter; const std::vector AllowedTypes; }; diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 4dfbd8ca49ab9..7095c564444fe 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -221,6 +221,10 @@ Changes in existing checks ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. +- Improved :doc:`misc-const-correctness + ` check by avoiding infinite recursion + for recursive forwarding references. + +- Improved :doc:`misc-definitions-in-headers ` check by replacing the local option `HeaderFileExtensions` by the global option of the same name. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst index a3e88b837d375..c37df1706eb4e 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst @@ -190,6 +190,6 @@ Options .. option:: WarnOnSizeOfPointerToAggregate - When `true, the check will warn on an expression like + When `true`, the check will warn on an expression like ``sizeof(expr)`` where the expression is a pointer to aggregate. Default is `true`. diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp index 9da468128743e..248374a71dd40 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp @@ -58,3 +58,18 @@ void concatenate3(Args... args) (..., (stream << args)); } } // namespace gh70323 + +namespace gh60895 { + +template <class T> void f1(T &&a); +template <class T> void f2(T &&a); +template <class T> void f1(T &&a) { f2(a); } +template <class T> void f2(T &&a) { f1(a); } +void f() { + int x = 0; + // CHECK-MESSAGES:[[@LINE-1]]:3: warning: variable 'x' of type 'int' can be declared 'const' + // CHECK-FIXES: int const x = 0; + f1(x); +} + +} // namespace gh60895 diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 05c8f765b5569..3bead159c8f94 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -3466,6 +3466,54 @@ Query for this feature with ``__has_builtin(__builtin_trap)``. ``__builtin_arm_trap`` is lowered to the ``llvm.aarch64.break`` builtin, and then to ``brk #payload``. +``__builtin_allow_runtime_check`` +--------------------------------- + +``__builtin_allow_runtime_check`` returns true if the check at the current +program location should be executed. It is expected to be used to implement +``assert``-like checks which can be safely removed by the optimizer. + +**Syntax**: + +.. code-block:: c++ + + bool __builtin_allow_runtime_check(const char* kind) + +**Example of use**: + +..
code-block:: c++ + + if (__builtin_allow_runtime_check("mycheck") && !ExpensiveCheck()) { + abort(); + } + +**Description** + +``__builtin_allow_runtime_check`` is lowered to the ` ``llvm.allow.runtime.check`` +`_ +builtin. + +``__builtin_allow_runtime_check()`` is expected to be used in control-flow +conditions, such as in ``if`` statements, to guard expensive runtime checks. The +specific rules for selecting permitted checks can differ and are controlled by +the compiler options. + +Flags to control checks: +* ``-mllvm -lower-allow-check-percentile-cutoff-hot=N`` where N is a PGO hotness +cutoff in the range ``[0, 999999]`` used to disallow checks in hot code. +* ``-mllvm -lower-allow-check-random-rate=P`` where P is a number in the range +``[0.0, 1.0]`` representing the probability of keeping a check. +* If both flags are specified, ``-lower-allow-check-random-rate`` takes +precedence. +* If neither is specified, ``__builtin_allow_runtime_check`` is lowered as +``true``, allowing all checks. + +The ``kind`` parameter is a string literal representing a user-selected kind for +the guarded check. It is currently unused, but will enable kind-specific lowering +in the future; e.g., a higher hotness cutoff could be used for a more expensive +kind of check. + +Query for this feature with ``__has_builtin(__builtin_allow_runtime_check)``. + ``__builtin_nondeterministic_value`` ------------------------------------ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dc108785f6cc9..c19ad9fba58f3 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -68,7 +68,7 @@ AST Dumping Potentially Breaking Changes Clang Frontend Potentially Breaking Changes ------------------------------------------- -- Removed support for constructing on-stack ``TemplateArgumentList``s; interfaces should instead +- Removed support for constructing on-stack ``TemplateArgumentList``\ s; interfaces should instead use ``ArrayRef`` to pass template arguments. Transitioning internal uses to ``ArrayRef`` reduces AST memory usage by 0.4% when compiling clang, and is expected to show similar improvements on other workloads. @@ -104,8 +104,7 @@ C++20 Feature Support - Clang now implements [module.import]p7 fully. Clang now will import module units transitively for the module units coming from the same module of the - current module units. - Fixes `#84002 `_. + current module units. Fixes #GH84002 - Initial support for class template argument deduction (CTAD) for type alias templates (`P1814R0 `_). @@ -135,8 +134,7 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Substitute template parameter pack, when it is not explicitly specified - in the template parameters, but is deduced from a previous argument. - (`#78449: `_). + in the template parameters, but is deduced from a previous argument. (#GH78449) - Type qualifications are now ignored when evaluating layout compatibility of two types. @@ -176,8 +174,7 @@ C23 Feature Support - Clang now generates predefined macros of the form ``__TYPE_FMTB__`` and ``__TYPE_FMTb__`` (e.g., ``__UINT_FAST64_FMTB__``) in C23 mode for use with - macros typically exposed from ``<inttypes.h>``, such as ``PRIb8``. - (`#81896: `_). + macros typically exposed from ``<inttypes.h>``, such as ``PRIb8``. (#GH81896) - Clang now supports `N3018 The constexpr specifier for object definitions` `_. @@ -215,7 +212,10 @@ New Compiler Flags - ``-Wmissing-designated-field-initializers``, grouped under ``-Wmissing-field-initializers``.
This diagnostic can be disabled to make ``-Wmissing-field-initializers`` behave - like it did before Clang 18.x. Fixes (`#56628 `_) + like it did before Clang 18.x. Fixes #GH56628 + +- ``-fexperimental-modules-reduced-bmi`` enables the Reduced BMI for C++20 named modules. + See the documentation of standard C++ modules for details. Deprecated Compiler Flags ------------------------- @@ -254,8 +254,7 @@ Removed Compiler Flags - The ``-freroll-loops`` flag has been removed. It had no effect since Clang 13. - ``-m[no-]unaligned-access`` is removed for RISC-V and LoongArch. - ``-m[no-]strict-align``, also supported by GCC, should be used instead. - (`#85350 `_.) + ``-m[no-]strict-align``, also supported by GCC, should be used instead. (#GH85350) Attribute Changes in Clang -------------------------- @@ -325,8 +324,7 @@ Improvements to Clang's diagnostics Fixes #GH82512. - Clang now provides improved warnings for the ``cleanup`` attribute to detect misuse scenarios, - such as attempting to call ``free`` on an unallocated object. Fixes - `#79443 `_. + such as attempting to call ``free`` on an unallocated object. Fixes #GH79443. - Clang no longer warns when the ``bitand`` operator is used with boolean operands, distinguishing it from potential typographical errors or unintended @@ -366,17 +364,18 @@ Improvements to Clang's diagnostics - Clang now uses the correct type-parameter-key (``class`` or ``typename``) when printing template template parameter declarations. +- Clang now diagnoses requires expressions with explicit object parameters. + Improvements to Clang's time-trace ---------------------------------- Bug Fixes in This Version ------------------------- - Clang's ``-Wundefined-func-template`` no longer warns on pure virtual - functions. - (`#74016 `_) + functions. (#GH74016) - Fixed missing warnings when comparing mismatched enumeration constants - in C (`#29217 `). + in C (#GH29217) - Clang now accepts elaborated-type-specifiers that explicitly specialize a member class template for an implicit instantiation of a class template. @@ -415,7 +414,7 @@ Bug Fixes in This Version type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated as ``_Complex float / float`` rather than ``_Complex float / _Complex float``), as mandated by the C standard. This significantly improves codegen of `*` and `/` especially. - Fixes (`#31205 `_). + Fixes #GH31205. - Fixes an assertion failure on invalid code when trying to define member functions in lambdas. @@ -423,6 +422,8 @@ Bug Fixes in This Version - Fixed a regression in CTAD where a friend declaration that befriends itself could cause incorrect constraint substitution. (#GH86769). + +- Fixed an assertion failure on invalid InitListExpr in C89 mode (#GH88008). + Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -464,8 +465,7 @@ Bug Fixes to C++ Support - Fix a crash when trying to call a varargs function that also has an explicit object parameter. (#GH80971) - Fixed a bug where abbreviated function templates would append their invented template parameters to an empty template parameter list. -- Fix parsing of abominable function types inside type traits. - Fixes (`#77585 `_) +- Fix parsing of abominable function types inside type traits. Fixes #GH77585 - Clang now classifies aggregate initialization in C++17 and newer as constant or non-constant more accurately. Previously, only a subset of the initializer elements were considered, misclassifying some initializers as constant.
Partially fixes @@ -506,9 +506,7 @@ Bug Fixes to C++ Support - Fix a bug where overload resolution falsely reported an ambiguity when it was comparing a member-function against a non-member function or a member-function with an explicit object parameter against a member function with no explicit object parameter - when one of the function had more specialized templates. - Fixes (`#82509 `_) - and (`#74494 `_) + when one of the functions had more specialized templates. Fixes #GH82509 and #GH74494 - Clang now supports direct lambda calls inside of type alias template declarations. This addresses (#GH70601), (#GH76674), (#GH79555), (#GH81145) and (#GH82104). - Allow access to a public template alias declaration that refers to friend's @@ -530,14 +528,14 @@ Bug Fixes to C++ Support - Fixed a bug that prevented member function templates of class templates declared with a deduced return type from being explicitly specialized for a given implicit instantiation of the class template. -- Fix crash when inheriting from a cv-qualified type. Fixes: - (`#35603 `_) +- Fix crash when inheriting from a cv-qualified type. Fixes #GH35603 - Fix a crash when the using enum declaration uses an anonymous enumeration. Fixes (#GH86790). - Handled an edge case in ``getFullyPackExpandedSize`` so that we now avoid a false-positive diagnostic. (#GH84220) - Clang now correctly tracks type dependence of by-value captures in lambdas with an explicit object parameter. Fixes (#GH70604), (#GH79754), (#GH84163), (#GH84425), (#GH86054), (#GH86398), and (#GH86399). - Fix a crash when deducing ``auto`` from an invalid dereference (#GH88329). +- Fix a crash in a requires expression with a templated base class member function. Fixes (#GH84020). Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -551,8 +549,7 @@ Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - Do not attempt to dump the layout of dependent types or invalid declarations - when ``-fdump-record-layouts-complete`` is passed. - Fixes (`#83684 `_). + when ``-fdump-record-layouts-complete`` is passed. Fixes #GH83684. OpenACC Specific Changes ------------------------ @@ -602,8 +599,7 @@ Windows Support would only be included if AVX was enabled at compile time. This was done to work around include times from MSVC STL including ``intrin.h`` under clang-cl. Clang-cl now provides ``intrin0.h`` for MSVC STL and therefore all intrinsic - features without requiring enablement at compile time. - Fixes: (`#53520 `_) + features without requiring enablement at compile time. Fixes #GH53520 - Improved compile times with MSVC STL. MSVC provides ``intrin0.h`` which is a header that only includes intrinsics that are used by MSVC STL to avoid the @@ -684,6 +680,8 @@ Static Analyzer but not under any case blocks if ``unroll-loops=true`` analyzer config is set. (#GH68819) - Support C++23 static operator calls. (#GH84972) +- Fixed a crash in the ``security.cert.env.InvalidPtr`` checker when it accidentally + matched user-defined ``strerror`` and similar library functions. (#GH88181) New features ^^^^^^^^^^^^ diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index c5478bba45f38..8d5529d5d37db 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -520,6 +520,112 @@ is attached to the global module fragments. For example: Now the linkage name of ``NS::foo()`` will be ``_ZN2NS3fooEv``.
+Reduced BMI +----------- + +To support the two-phase compilation model, Clang chose to put everything needed to +produce an object file into the BMI. But consumers of the BMI, other than the compilation +of the module unit itself, don't need that information. This makes the BMI larger and so +may introduce unnecessary dependencies into it. To mitigate the problem, we decided to +reduce the information contained in the BMI. + +To be clear, we call the default BMI the Full BMI and the newly introduced BMI the Reduced +BMI. + +Users can pass the ``-fexperimental-modules-reduced-bmi`` flag to enable Reduced BMI. + +For the one-phase compilation model (the model CMake implements), with +``-fexperimental-modules-reduced-bmi``, the generated BMI will automatically be a Reduced BMI. +(The output path of the BMI is specified by ``-fmodule-output=``, as usual in the one-phase +compilation model.) + +It is still possible to use Reduced BMI in the two-phase compilation model. With +``-fexperimental-modules-reduced-bmi``, ``--precompile``, and ``-fmodule-output=`` specified, +the BMI specified by ``-o`` will be a Full BMI and the BMI specified by +``-fmodule-output=`` will be a Reduced BMI. The dependency graph may be: + +.. code-block:: none + + module-unit.cppm --> module-unit.full.pcm -> module-unit.o + | + -> module-unit.reduced.pcm -> consumer1.cpp + -> consumer2.cpp + -> ... + -> consumer_n.cpp + +We don't emit diagnostics if ``-fexperimental-modules-reduced-bmi`` is used with a non-module +unit. This design lets end users of the one-phase compilation model perform experiments +early without asking for help from build systems. Users of build systems that support the +two-phase compilation model still need support from those build systems. + +In a Reduced BMI, we don't write unreachable entities from the GMF, nor the definitions of +non-inline functions and non-inline variables. This may not be a transparent change. +`[module.global.frag]ex2 `_ may be a good +example: + +.. code-block:: c++ + + // foo.h + namespace N { + struct X {}; + int d(); + int e(); + inline int f(X, int = d()) { return e(); } + int g(X); + int h(X); + } + + // M.cppm + module; + #include "foo.h" + export module M; + template <class T> int use_f() { + N::X x; // N::X, N, and :: are decl-reachable from use_f + return f(x, 123); // N::f is decl-reachable from use_f, + // N::e is indirectly decl-reachable from use_f + // because it is decl-reachable from N::f, and + // N::d is decl-reachable from use_f + // because it is decl-reachable from N::f + // even though it is not used in this call + } + template <class T> int use_g() { + N::X x; // N::X, N, and :: are decl-reachable from use_g + return g((T(), x)); // N::g is not decl-reachable from use_g + } + template <class T> int use_h() { + N::X x; // N::X, N, and :: are decl-reachable from use_h + return h((T(), x)); // N::h is not decl-reachable from use_h, but + // N::h is decl-reachable from use_h<int> + } + int k = use_h<int>(); + // use_h<int> is decl-reachable from k, so + // N::h is decl-reachable from k + + // M-impl.cpp + module M; + int a = use_f<int>(); // OK + int b = use_g<int>(); // error: no viable function for call to g; + // g is not decl-reachable from purview of + // module M's interface, so is discarded + int c = use_h<int>(); // OK + +In the above example, the function definition of ``N::g`` is elided from the Reduced +BMI of ``M.cppm``, so the use of ``use_g<int>`` in ``M-impl.cpp`` fails +to instantiate. For such issues, users can add references to ``N::g`` in the module purview +of ``M.cppm`` to make sure it is reachable, e.g., ``using N::g;``.
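+ +As an illustration (the module and file names here are hypothetical), a two-phase build that +produces both BMIs as described above may be invoked like: + +.. code-block:: console + + # Emits the Full BMI at the path given by -o and the Reduced BMI + # at the path given by -fmodule-output=. + $ clang++ -std=c++20 M.cppm --precompile -fexperimental-modules-reduced-bmi \ + -fmodule-output=M.reduced.pcm -o M.full.pcm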
+ +We think the Reduced BMI is the correct direction. But given it is a drastic change, +we'd like to make it experimental first to avoid breaking existing users. The roadmap +of Reduced BMI may be: + +1. ``-fexperimental-modules-reduced-bmi`` is opt-in for 1~2 releases. The period depends +on testing feedback. +2. We would announce that Reduced BMI is no longer experimental, introduce ``-fmodules-reduced-bmi``, +and suggest users enable this mode. This may take 1~2 releases too. +3. Finally, we will enable this by default. When that time comes, the term BMI will refer to +what is the Reduced BMI today, and the Full BMI will only be meaningful to build systems that +want to support two-phase compilation. + Performance Tips ---------------- diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt index 3089438c23d94..2252d0ccde96d 100644 --- a/clang/docs/tools/clang-formatted-files.txt +++ b/clang/docs/tools/clang-formatted-files.txt @@ -123,6 +123,7 @@ clang/include/clang/Analysis/Analyses/CalledOnceCheck.h clang/include/clang/Analysis/Analyses/CFGReachabilityAnalysis.h clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h clang/include/clang/Analysis/FlowSensitive/AdornedCFG.h +clang/include/clang/Analysis/FlowSensitive/ASTOps.h clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -307,6 +308,7 @@ clang/lib/Analysis/CalledOnceCheck.cpp clang/lib/Analysis/CloneDetection.cpp clang/lib/Analysis/CodeInjector.cpp clang/lib/Analysis/FlowSensitive/AdornedCFG.cpp +clang/lib/Analysis/FlowSensitive/ASTOps.cpp clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp clang/lib/Analysis/FlowSensitive/DebugSupport.cpp diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 2194d268fa86f..1079993f49694 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -672,16 +672,6 @@ class alignas(8) Decl { /// Whether this declaration comes from explicit global module. bool isFromExplicitGlobalModule() const; - /// Check if we should skip checking ODRHash for declaration \param D. - /// - /// The existing ODRHash mechanism seems to be not stable enough and - /// the false positive ODR violation reports are annoying and we rarely see - /// true ODR violation reports. Also we learned that MSVC disabled ODR checks - /// for declarations in GMF. So we try to disable ODR checks in the GMF to - /// get better user experiences before we make the ODR violation checks stable - /// enough. - bool shouldSkipCheckingODR() const; - /// Return true if this declaration has an attribute which acts as /// definition of the entity, such as 'alias' or 'ifunc'. bool hasDefiningAttr() const; diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index 401b8e904a1b7..07587849eb121 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -145,6 +145,17 @@ class OpenACCIfClause : public OpenACCClauseWithCondition { SourceLocation EndLoc); }; +/// A 'self' clause, which has an optional condition expression.
+class OpenACCSelfClause : public OpenACCClauseWithCondition { + OpenACCSelfClause(SourceLocation BeginLoc, SourceLocation LParenLoc, + Expr *ConditionExpr, SourceLocation EndLoc); + +public: + static OpenACCSelfClause *Create(const ASTContext &C, SourceLocation BeginLoc, + SourceLocation LParenLoc, + Expr *ConditionExpr, SourceLocation EndLoc); +}; + template class OpenACCClauseVisitor { Impl &getDerived() { return static_cast(*this); } @@ -159,53 +170,13 @@ template class OpenACCClauseVisitor { return; switch (C->getClauseKind()) { - case OpenACCClauseKind::Default: - VisitDefaultClause(*cast(C)); - return; - case OpenACCClauseKind::If: - VisitIfClause(*cast(C)); - return; - case OpenACCClauseKind::Finalize: - case OpenACCClauseKind::IfPresent: - case OpenACCClauseKind::Seq: - case OpenACCClauseKind::Independent: - case OpenACCClauseKind::Auto: - case OpenACCClauseKind::Worker: - case OpenACCClauseKind::Vector: - case OpenACCClauseKind::NoHost: - case OpenACCClauseKind::Self: - case OpenACCClauseKind::Copy: - case OpenACCClauseKind::UseDevice: - case OpenACCClauseKind::Attach: - case OpenACCClauseKind::Delete: - case OpenACCClauseKind::Detach: - case OpenACCClauseKind::Device: - case OpenACCClauseKind::DevicePtr: - case OpenACCClauseKind::DeviceResident: - case OpenACCClauseKind::FirstPrivate: - case OpenACCClauseKind::Host: - case OpenACCClauseKind::Link: - case OpenACCClauseKind::NoCreate: - case OpenACCClauseKind::Present: - case OpenACCClauseKind::Private: - case OpenACCClauseKind::CopyOut: - case OpenACCClauseKind::CopyIn: - case OpenACCClauseKind::Create: - case OpenACCClauseKind::Reduction: - case OpenACCClauseKind::Collapse: - case OpenACCClauseKind::Bind: - case OpenACCClauseKind::VectorLength: - case OpenACCClauseKind::NumGangs: - case OpenACCClauseKind::NumWorkers: - case OpenACCClauseKind::DeviceNum: - case OpenACCClauseKind::DefaultAsync: - case OpenACCClauseKind::DeviceType: - case OpenACCClauseKind::DType: - case OpenACCClauseKind::Async: - case OpenACCClauseKind::Tile: - case OpenACCClauseKind::Gang: - case OpenACCClauseKind::Wait: - case OpenACCClauseKind::Invalid: +#define VISIT_CLAUSE(CLAUSE_NAME) \ + case OpenACCClauseKind::CLAUSE_NAME: \ + Visit##CLAUSE_NAME##Clause(*cast(C)); \ + return; +#include "clang/Basic/OpenACCClauses.def" + + default: llvm_unreachable("Clause visitor not yet implemented"); } llvm_unreachable("Invalid Clause kind"); diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index 419cb6cada0bc..66f8f844e0b29 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -142,9 +142,7 @@ class OpenACCComputeConstruct final Stmt *StructuredBlock) : OpenACCAssociatedStmtConstruct(OpenACCComputeConstructClass, K, Start, End, StructuredBlock) { - assert((K == OpenACCDirectiveKind::Parallel || - K == OpenACCDirectiveKind::Serial || - K == OpenACCDirectiveKind::Kernels) && + assert(isOpenACCComputeDirectiveKind(K) && "Only parallel, serial, and kernels constructs should be " "represented by this type"); diff --git a/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h b/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h index 1ceef944fbc34..117173ba9a095 100644 --- a/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h +++ b/clang/include/clang/Analysis/Analyses/ExprMutationAnalyzer.h @@ -8,11 +8,9 @@ #ifndef LLVM_CLANG_ANALYSIS_ANALYSES_EXPRMUTATIONANALYZER_H #define LLVM_CLANG_ANALYSIS_ANALYSES_EXPRMUTATIONANALYZER_H -#include - -#include "clang/AST/AST.h" 
#include "clang/ASTMatchers/ASTMatchers.h" #include "llvm/ADT/DenseMap.h" +#include namespace clang { @@ -21,14 +19,74 @@ class FunctionParmMutationAnalyzer; /// Analyzes whether any mutative operations are applied to an expression within /// a given statement. class ExprMutationAnalyzer { + friend class FunctionParmMutationAnalyzer; + public: + struct Memoized { + using ResultMap = llvm::DenseMap; + using FunctionParaAnalyzerMap = + llvm::SmallDenseMap>; + + ResultMap Results; + ResultMap PointeeResults; + FunctionParaAnalyzerMap FuncParmAnalyzer; + + void clear() { + Results.clear(); + PointeeResults.clear(); + FuncParmAnalyzer.clear(); + } + }; + struct Analyzer { + Analyzer(const Stmt &Stm, ASTContext &Context, Memoized &Memorized) + : Stm(Stm), Context(Context), Memorized(Memorized) {} + + const Stmt *findMutation(const Expr *Exp); + const Stmt *findMutation(const Decl *Dec); + + const Stmt *findPointeeMutation(const Expr *Exp); + const Stmt *findPointeeMutation(const Decl *Dec); + static bool isUnevaluated(const Stmt *Smt, const Stmt &Stm, + ASTContext &Context); + + private: + using MutationFinder = const Stmt *(Analyzer::*)(const Expr *); + + const Stmt *findMutationMemoized(const Expr *Exp, + llvm::ArrayRef Finders, + Memoized::ResultMap &MemoizedResults); + const Stmt *tryEachDeclRef(const Decl *Dec, MutationFinder Finder); + + bool isUnevaluated(const Expr *Exp); + + const Stmt *findExprMutation(ArrayRef Matches); + const Stmt *findDeclMutation(ArrayRef Matches); + const Stmt * + findExprPointeeMutation(ArrayRef Matches); + const Stmt * + findDeclPointeeMutation(ArrayRef Matches); + + const Stmt *findDirectMutation(const Expr *Exp); + const Stmt *findMemberMutation(const Expr *Exp); + const Stmt *findArrayElementMutation(const Expr *Exp); + const Stmt *findCastMutation(const Expr *Exp); + const Stmt *findRangeLoopMutation(const Expr *Exp); + const Stmt *findReferenceMutation(const Expr *Exp); + const Stmt *findFunctionArgMutation(const Expr *Exp); + + const Stmt &Stm; + ASTContext &Context; + Memoized &Memorized; + }; + ExprMutationAnalyzer(const Stmt &Stm, ASTContext &Context) - : Stm(Stm), Context(Context) {} + : Memorized(), A(Stm, Context, Memorized) {} bool isMutated(const Expr *Exp) { return findMutation(Exp) != nullptr; } bool isMutated(const Decl *Dec) { return findMutation(Dec) != nullptr; } - const Stmt *findMutation(const Expr *Exp); - const Stmt *findMutation(const Decl *Dec); + const Stmt *findMutation(const Expr *Exp) { return A.findMutation(Exp); } + const Stmt *findMutation(const Decl *Dec) { return A.findMutation(Dec); } bool isPointeeMutated(const Expr *Exp) { return findPointeeMutation(Exp) != nullptr; @@ -36,51 +94,40 @@ class ExprMutationAnalyzer { bool isPointeeMutated(const Decl *Dec) { return findPointeeMutation(Dec) != nullptr; } - const Stmt *findPointeeMutation(const Expr *Exp); - const Stmt *findPointeeMutation(const Decl *Dec); + const Stmt *findPointeeMutation(const Expr *Exp) { + return A.findPointeeMutation(Exp); + } + const Stmt *findPointeeMutation(const Decl *Dec) { + return A.findPointeeMutation(Dec); + } + static bool isUnevaluated(const Stmt *Smt, const Stmt &Stm, - ASTContext &Context); + ASTContext &Context) { + return Analyzer::isUnevaluated(Smt, Stm, Context); + } private: - using MutationFinder = const Stmt *(ExprMutationAnalyzer::*)(const Expr *); - using ResultMap = llvm::DenseMap; - - const Stmt *findMutationMemoized(const Expr *Exp, - llvm::ArrayRef Finders, - ResultMap &MemoizedResults); - const Stmt *tryEachDeclRef(const Decl 
*Dec, MutationFinder Finder); - - bool isUnevaluated(const Expr *Exp); - - const Stmt *findExprMutation(ArrayRef Matches); - const Stmt *findDeclMutation(ArrayRef Matches); - const Stmt * - findExprPointeeMutation(ArrayRef Matches); - const Stmt * - findDeclPointeeMutation(ArrayRef Matches); - - const Stmt *findDirectMutation(const Expr *Exp); - const Stmt *findMemberMutation(const Expr *Exp); - const Stmt *findArrayElementMutation(const Expr *Exp); - const Stmt *findCastMutation(const Expr *Exp); - const Stmt *findRangeLoopMutation(const Expr *Exp); - const Stmt *findReferenceMutation(const Expr *Exp); - const Stmt *findFunctionArgMutation(const Expr *Exp); - - const Stmt &Stm; - ASTContext &Context; - llvm::DenseMap> - FuncParmAnalyzer; - ResultMap Results; - ResultMap PointeeResults; + Memoized Memorized; + Analyzer A; }; // A convenient wrapper around ExprMutationAnalyzer for analyzing function // params. class FunctionParmMutationAnalyzer { public: - FunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context); + static FunctionParmMutationAnalyzer * + getFunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context, + ExprMutationAnalyzer::Memoized &Memorized) { + auto it = Memorized.FuncParmAnalyzer.find(&Func); + if (it == Memorized.FuncParmAnalyzer.end()) + it = + Memorized.FuncParmAnalyzer + .try_emplace(&Func, std::unique_ptr( + new FunctionParmMutationAnalyzer( + Func, Context, Memorized))) + .first; + return it->getSecond().get(); + } bool isMutated(const ParmVarDecl *Parm) { return findMutation(Parm) != nullptr; @@ -88,8 +135,11 @@ class FunctionParmMutationAnalyzer { const Stmt *findMutation(const ParmVarDecl *Parm); private: - ExprMutationAnalyzer BodyAnalyzer; + ExprMutationAnalyzer::Analyzer BodyAnalyzer; llvm::DenseMap Results; + + FunctionParmMutationAnalyzer(const FunctionDecl &Func, ASTContext &Context, + ExprMutationAnalyzer::Memoized &Memorized); }; } // namespace clang diff --git a/clang/include/clang/Analysis/FlowSensitive/ASTOps.h b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h new file mode 100644 index 0000000000000..27ad32c1694f7 --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/ASTOps.h @@ -0,0 +1,98 @@ +//===-- ASTOps.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Operations on AST nodes that are used in flow-sensitive analysis. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ASTOPS_H +#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ASTOPS_H + +#include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" +#include "clang/AST/Type.h" +#include "clang/Analysis/FlowSensitive/StorageLocation.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" + +namespace clang { +namespace dataflow { + +/// Skip past nodes that the CFG does not emit. These nodes are invisible to +/// flow-sensitive analysis, and should be ignored as they will effectively not +/// exist. +/// +/// * `ParenExpr` - The CFG takes the operator precedence into account, but +/// otherwise omits the node afterwards. +/// +/// * `ExprWithCleanups` - The CFG will generate the appropriate calls to +/// destructors and then omit the node. 
+/// +const Expr &ignoreCFGOmittedNodes(const Expr &E); +const Stmt &ignoreCFGOmittedNodes(const Stmt &S); + +/// A set of `FieldDecl *`. Use `SmallSetVector` to guarantee deterministic +/// iteration order. +using FieldSet = llvm::SmallSetVector; + +/// Returns the set of all fields in the type. +FieldSet getObjectFields(QualType Type); + +/// Returns whether `Fields` and `FieldLocs` contain the same fields. +bool containsSameFields(const FieldSet &Fields, + const RecordStorageLocation::FieldToLoc &FieldLocs); + +/// Helper class for initialization of a record with an `InitListExpr`. +/// `InitListExpr::inits()` contains the initializers for both the base classes +/// and the fields of the record; this helper class separates these out into two +/// different lists. In addition, it deals with special cases associated with +/// unions. +class RecordInitListHelper { +public: + // `InitList` must have record type. + RecordInitListHelper(const InitListExpr *InitList); + + // Base classes with their associated initializer expressions. + ArrayRef> base_inits() const { + return BaseInits; + } + + // Fields with their associated initializer expressions. + ArrayRef> field_inits() const { + return FieldInits; + } + +private: + SmallVector> BaseInits; + SmallVector> FieldInits; + + // We potentially synthesize an `ImplicitValueInitExpr` for unions. It's a + // member variable because we store a pointer to it in `FieldInits`. + std::optional ImplicitValueInitForUnion; +}; + +/// A collection of several types of declarations, all referenced from the same +/// function. +struct ReferencedDecls { + /// Non-static member variables. + FieldSet Fields; + /// All variables with static storage duration, notably including static + /// member variables and static variables declared within a function. + llvm::DenseSet Globals; + /// Free functions and member functions which are referenced (but not + /// necessarily called). + llvm::DenseSet Functions; +}; + +/// Returns declarations that are declared in or referenced from `FD`. +ReferencedDecls getReferencedDecls(const FunctionDecl &FD); + +} // namespace dataflow +} // namespace clang + +#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_ASTOPS_H diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h index 909a91059438c..aa2c366cb164a 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h @@ -18,6 +18,7 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" #include "clang/AST/TypeOrdering.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/AdornedCFG.h" #include "clang/Analysis/FlowSensitive/Arena.h" #include "clang/Analysis/FlowSensitive/Solver.h" @@ -30,38 +31,11 @@ #include #include #include -#include -#include -#include namespace clang { namespace dataflow { class Logger; -/// Skip past nodes that the CFG does not emit. These nodes are invisible to -/// flow-sensitive analysis, and should be ignored as they will effectively not -/// exist. -/// -/// * `ParenExpr` - The CFG takes the operator precedence into account, but -/// otherwise omits the node afterwards. -/// -/// * `ExprWithCleanups` - The CFG will generate the appropriate calls to -/// destructors and then omit the node. -/// -const Expr &ignoreCFGOmittedNodes(const Expr &E); -const Stmt &ignoreCFGOmittedNodes(const Stmt &S); - -/// A set of `FieldDecl *`. 
Use `SmallSetVector` to guarantee deterministic -/// iteration order. -using FieldSet = llvm::SmallSetVector; - -/// Returns the set of all fields in the type. -FieldSet getObjectFields(QualType Type); - -/// Returns whether `Fields` and `FieldLocs` contain the same fields. -bool containsSameFields(const FieldSet &Fields, - const RecordStorageLocation::FieldToLoc &FieldLocs); - struct ContextSensitiveOptions { /// The maximum depth to analyze. A value of zero is equivalent to disabling /// context-sensitive analysis entirely. diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 706664d7db1c2..4277792219c0a 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -775,42 +775,6 @@ RecordStorageLocation *getImplicitObjectLocation(const CXXMemberCallExpr &MCE, RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, const Environment &Env); -/// Returns the fields of a `RecordDecl` that are initialized by an -/// `InitListExpr`, in the order in which they appear in -/// `InitListExpr::inits()`. -/// `Init->getType()` must be a record type. -std::vector -getFieldsForInitListExpr(const InitListExpr *InitList); - -/// Helper class for initialization of a record with an `InitListExpr`. -/// `InitListExpr::inits()` contains the initializers for both the base classes -/// and the fields of the record; this helper class separates these out into two -/// different lists. In addition, it deals with special cases associated with -/// unions. -class RecordInitListHelper { -public: - // `InitList` must have record type. - RecordInitListHelper(const InitListExpr *InitList); - - // Base classes with their associated initializer expressions. - ArrayRef> base_inits() const { - return BaseInits; - } - - // Fields with their associated initializer expressions. - ArrayRef> field_inits() const { - return FieldInits; - } - -private: - SmallVector> BaseInits; - SmallVector> FieldInits; - - // We potentially synthesize an `ImplicitValueInitExpr` for unions. It's a - // member variable because we store a pointer to it in `FieldInits`. - std::optional ImplicitValueInitForUnion; -}; - /// Associates a new `RecordValue` with `Loc` and returns the new value. 
RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env); diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index d6ceb450bd106..de721a87b3341 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1164,6 +1164,12 @@ def Unreachable : Builtin { let Prototype = "void()"; } +def AllowRuntimeCheck : Builtin { + let Spellings = ["__builtin_allow_runtime_check"]; + let Attributes = [NoThrow, Pure, Const]; + let Prototype = "bool(char const*)"; +} + def ShuffleVector : Builtin { let Spellings = ["__builtin_shufflevector"]; let Attributes = [NoThrow, Const, CustomTypeChecking]; diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index 38f30543a0f66..ba0e4465a0f5a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -50,17 +50,15 @@ const char *CudaVersionToString(CudaVersion V); // Input is "Major.Minor" CudaVersion CudaStringToVersion(const llvm::Twine &S); -// We have a name conflict with sys/mac.h on AIX -#ifdef SM_32 -#undef SM_32 -#endif enum class CudaArch { UNUSED, UNKNOWN, + // TODO: Deprecate and remove GPU architectures older than sm_52. SM_20, SM_21, SM_30, - SM_32, + // This has a name conflict with sys/mac.h on AIX, rename it as a workaround. + SM_32_, SM_35, SM_37, SM_50, diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 47747d8704b6c..5251774ff4efd 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1412,9 +1412,6 @@ def MultiGPU: DiagGroup<"multi-gpu">; // libc and the CRT to be skipped. def AVRRtlibLinkingQuirks : DiagGroup<"avr-rtlib-linking-quirks">; -// A warning group related to AArch64 SME function attribues. -def AArch64SMEAttributes : DiagGroup<"aarch64-sme-attributes">; - // A warning group for things that will change semantics in the future. def FutureCompat : DiagGroup<"future-compat">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index bb9ca2a50cc06..66405095d51de 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -863,6 +863,8 @@ def err_empty_requires_expr : Error< "a requires expression must contain at least one requirement">; def err_requires_expr_parameter_list_ellipsis : Error< "varargs not allowed in requires expression">; +def err_requires_expr_explicit_object_parameter: Error< + "a requires expression cannot have an explicit object parameter">; def err_expected_semi_requirement : Error< "expected ';' at end of requirement">; def err_requires_expr_missing_arrow : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 5ec0218aedfe8..30a8543489f48 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3751,16 +3751,6 @@ def err_sme_definition_using_za_in_non_sme_target : Error< "function using ZA state requires 'sme'">; def err_sme_definition_using_zt0_in_non_sme2_target : Error< "function using ZT0 state requires 'sme2'">; -def warn_sme_streaming_pass_return_vl_to_non_streaming : Warning< - "passing a VL-dependent argument to/from a function that has a different" - " streaming-mode. 
The streaming and non-streaming vector lengths may be" - " different">, - InGroup, DefaultIgnore; -def warn_sme_locally_streaming_has_vl_args_returns : Warning< - "passing/returning a VL-dependent argument to/from a __arm_locally_streaming" - " function. The streaming and non-streaming vector" - " lengths may be different">, - InGroup, DefaultIgnore; def err_conflicting_attributes_arm_state : Error< "conflicting attributes for state '%0'">; def err_sme_streaming_cannot_be_multiversioned : Error< @@ -12274,4 +12264,8 @@ def note_acc_branch_into_compute_construct : Note<"invalid branch into OpenACC Compute Construct">; def note_acc_branch_out_of_compute_construct : Note<"invalid branch out of OpenACC Compute Construct">; +def warn_acc_if_self_conflict + : Warning<"OpenACC construct 'self' has no effect when an 'if' clause " + "evaluates to true">, + InGroup>; } // end of sema component. diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def index 7fd2720e02ce2..378495d2c0909 100644 --- a/clang/include/clang/Basic/OpenACCClauses.def +++ b/clang/include/clang/Basic/OpenACCClauses.def @@ -17,5 +17,6 @@ VISIT_CLAUSE(Default) VISIT_CLAUSE(If) +VISIT_CLAUSE(Self) #undef VISIT_CLAUSE diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h index 3414df9999170..e3f7417843328 100644 --- a/clang/include/clang/Basic/OpenACCKinds.h +++ b/clang/include/clang/Basic/OpenACCKinds.h @@ -146,6 +146,12 @@ inline llvm::raw_ostream &operator<<(llvm::raw_ostream &Out, return printOpenACCDirectiveKind(Out, K); } +inline bool isOpenACCComputeDirectiveKind(OpenACCDirectiveKind K) { + return K == OpenACCDirectiveKind::Parallel || + K == OpenACCDirectiveKind::Serial || + K == OpenACCDirectiveKind::Kernels; +} + enum class OpenACCAtomicKind { Read, Write, diff --git a/clang/include/clang/Basic/arm_fp16.td b/clang/include/clang/Basic/arm_fp16.td index cb2a09303e8e1..d36b4617bef5d 100644 --- a/clang/include/clang/Basic/arm_fp16.td +++ b/clang/include/clang/Basic/arm_fp16.td @@ -14,7 +14,7 @@ include "arm_neon_incl.td" // ARMv8.2-A FP16 intrinsics. 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16" in { // Negate def VNEGSH : SInst<"vneg", "11", "Sh">; diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 7edac5afafaa9..6d655c39360d3 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -605,11 +605,11 @@ def VQDMULL_LANE : SOpInst<"vqdmull_lane", "(>Q)..I", "si", OP_QDMULL_LN>; def VQDMULH_N : SOpInst<"vqdmulh_n", "..1", "siQsQi", OP_QDMULH_N>; def VQRDMULH_N : SOpInst<"vqrdmulh_n", "..1", "siQsQi", OP_QRDMULH_N>; -let ArchGuard = "!defined(__aarch64__)" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)" in { def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>; def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>; } -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">; def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">; } @@ -686,7 +686,7 @@ multiclass REINTERPRET_CROSS_TYPES { // E.3.31 Vector reinterpret cast operations def VREINTERPRET : REINTERPRET_CROSS_SELF<"csilUcUsUiUlhfPcPsQcQsQiQlQUcQUsQUiQUlQhQfQPcQPs"> { - let ArchGuard = "!defined(__aarch64__)"; + let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)"; let BigEndianSafe = 1; } @@ -714,7 +714,7 @@ def VADDP : WInst<"vadd", "...", "PcPsPlQPcQPsQPl">; //////////////////////////////////////////////////////////////////////////////// // AArch64 Intrinsics -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { //////////////////////////////////////////////////////////////////////////////// // Load/Store @@ -1091,14 +1091,14 @@ let isLaneQ = 1 in { def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">; def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in { def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> { let isLaneQ = 1; } def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN> { let isLaneQ = 1; } -} // ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" +} // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" // Note: d type implemented by SCALAR_VMULX_LANE def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>; @@ -1143,7 +1143,7 @@ def SHA256H2 : SInst<"vsha256h2", "....", "QUi">; def SHA256SU1 : SInst<"vsha256su1", "....", "QUi">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sha3" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in { def BCAX : SInst<"vbcax", "....", "QUcQUsQUiQUlQcQsQiQl">; def EOR3 : SInst<"veor3", "....", "QUcQUsQUiQUlQcQsQiQl">; def RAX1 : SInst<"vrax1", "...", "QUl">; @@ -1153,14 +1153,14 @@ def XAR : SInst<"vxar", "...I", "QUl">; } } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sha3" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sha3" in { def SHA512SU0 : SInst<"vsha512su0", "...", "QUl">; def SHA512su1 : SInst<"vsha512su1", "....", "QUl">; def SHA512H : SInst<"vsha512h", "....", "QUl">; def 
SHA512H2 : SInst<"vsha512h2", "....", "QUl">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sm4" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in { def SM3SS1 : SInst<"vsm3ss1", "....", "QUi">; def SM3TT1A : SInst<"vsm3tt1a", "....I", "QUi">; def SM3TT1B : SInst<"vsm3tt1b", "....I", "QUi">; @@ -1170,7 +1170,7 @@ def SM3PARTW1 : SInst<"vsm3partw1", "....", "QUi">; def SM3PARTW2 : SInst<"vsm3partw2", "....", "QUi">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "sm4" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "sm4" in { def SM4E : SInst<"vsm4e", "...", "QUi">; def SM4EKEY : SInst<"vsm4ekey", "...", "QUi">; } @@ -1193,7 +1193,7 @@ def FCVTAS_S32 : SInst<"vcvta_s32", "S.", "fQf">; def FCVTAU_S32 : SInst<"vcvta_u32", "U.", "fQf">; } -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { def FCVTNS_S64 : SInst<"vcvtn_s64", "S.", "dQd">; def FCVTNU_S64 : SInst<"vcvtn_u64", "U.", "dQd">; def FCVTPS_S64 : SInst<"vcvtp_s64", "S.", "dQd">; @@ -1217,7 +1217,7 @@ def FRINTZ_S32 : SInst<"vrnd", "..", "fQf">; def FRINTI_S32 : SInst<"vrndi", "..", "fQf">; } -let ArchGuard = "defined(__aarch64__) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { +let ArchGuard = "(defined(__aarch64__) || defined(__arm64ec__)) && defined(__ARM_FEATURE_DIRECTED_ROUNDING)" in { def FRINTN_S64 : SInst<"vrndn", "..", "dQd">; def FRINTA_S64 : SInst<"vrnda", "..", "dQd">; def FRINTP_S64 : SInst<"vrndp", "..", "dQd">; @@ -1227,7 +1227,7 @@ def FRINTZ_S64 : SInst<"vrnd", "..", "dQd">; def FRINTI_S64 : SInst<"vrndi", "..", "dQd">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.5a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.5a" in { def FRINT32X_S32 : SInst<"vrnd32x", "..", "fQf">; def FRINT32Z_S32 : SInst<"vrnd32z", "..", "fQf">; def FRINT64X_S32 : SInst<"vrnd64x", "..", "fQf">; @@ -1247,7 +1247,7 @@ def FMAXNM_S32 : SInst<"vmaxnm", "...", "fQf">; def FMINNM_S32 : SInst<"vminnm", "...", "fQf">; } -let ArchGuard = "defined(__aarch64__) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)" in { +let ArchGuard = "(defined(__aarch64__) || defined(__arm64ec__)) && defined(__ARM_FEATURE_NUMERIC_MAXMIN)" in { def FMAXNM_S64 : SInst<"vmaxnm", "...", "dQd">; def FMINNM_S64 : SInst<"vminnm", "...", "dQd">; } @@ -1289,7 +1289,7 @@ def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPc">; // itself during generation so, unlike all other intrinsics, this one should // include *all* types, not just additional ones. 
def VVREINTERPRET : REINTERPRET_CROSS_SELF<"csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk"> { - let ArchGuard = "defined(__aarch64__)"; + let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)"; let BigEndianSafe = 1; } @@ -1401,7 +1401,7 @@ def SCALAR_SQDMULH : SInst<"vqdmulh", "111", "SsSi">; // Scalar Integer Saturating Rounding Doubling Multiply Half High def SCALAR_SQRDMULH : SInst<"vqrdmulh", "111", "SsSi">; -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" in { //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">; @@ -1409,7 +1409,7 @@ def SCALAR_SQRDMLAH : SInst<"vqrdmlah", "1111", "SsSi">; //////////////////////////////////////////////////////////////////////////////// // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half def SCALAR_SQRDMLSH : SInst<"vqrdmlsh", "1111", "SsSi">; -} // ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.1a" +} // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.1a" //////////////////////////////////////////////////////////////////////////////// // Scalar Floating-point Multiply Extended @@ -1651,7 +1651,7 @@ def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcS let isLaneQ = 1; } -} // ArchGuard = "defined(__aarch64__)" +} // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" // ARMv8.2-A FP16 vector intrinsics for A32/A64. let TargetGuard = "fullfp16" in { @@ -1775,7 +1775,7 @@ def VEXTH : WInst<"vext", "...I", "hQh">; def VREV64H : WOpInst<"vrev64", "..", "hQh", OP_REV64>; // ARMv8.2-A FP16 vector intrinsics for A64 only. -let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fullfp16" in { // Vector rounding def FRINTIH : SInst<"vrndi", "..", "hQh">; @@ -1856,7 +1856,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fullfp16" in { def FMINNMVH : SInst<"vminnmv", "1.", "hQh">; } -let ArchGuard = "defined(__aarch64__)" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { // Permutation def VTRN1H : SOpInst<"vtrn1", "...", "hQh", OP_TRN1>; def VZIP1H : SOpInst<"vzip1", "...", "hQh", OP_ZIP1>; @@ -1876,7 +1876,7 @@ let TargetGuard = "dotprod" in { def DOT : SInst<"vdot", "..(<<)(<<)", "iQiUiQUi">; def DOT_LANE : SOpInst<"vdot_lane", "..(<<)(<; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "dotprod" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "dotprod" in { // Variants indexing into a 128-bit vector are A64 only. def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(< { let isLaneQ = 1; @@ -1884,7 +1884,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "dotprod" in { } // v8.2-A FP16 fused multiply-add long instructions. 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp16fml" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "fp16fml" in { def VFMLAL_LOW : SInst<"vfmlal_low", ">>..", "hQh">; def VFMLSL_LOW : SInst<"vfmlsl_low", ">>..", "hQh">; def VFMLAL_HIGH : SInst<"vfmlal_high", ">>..", "hQh">; @@ -1918,7 +1918,7 @@ let TargetGuard = "i8mm" in { def VUSDOT_LANE : SOpInst<"vusdot_lane", "..(<; def VSUDOT_LANE : SOpInst<"vsudot_lane", "..(<<)(<; - let ArchGuard = "defined(__aarch64__)" in { + let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" in { let isLaneQ = 1 in { def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<; def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<; @@ -1986,7 +1986,7 @@ let TargetGuard = "v8.3a" in { defm VCMLA_F32 : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "v8.3a" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a" in { def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">; def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">; @@ -2058,14 +2058,14 @@ let TargetGuard = "bf16" in { def SCALAR_CVT_F32_BF16 : SOpInst<"vcvtah_f32", "(1F>)(1!)", "b", OP_CVT_F32_BF16>; } -let ArchGuard = "!defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in { def VCVT_BF16_F32_A32_INTERNAL : WInst<"__a32_vcvt_bf16", "BQ", "f">; def VCVT_BF16_F32_A32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A32>; def VCVT_LOW_BF16_F32_A32 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A32>; def VCVT_HIGH_BF16_F32_A32 : SOpInst<"vcvt_high_bf16", "BBQ", "Qf", OP_VCVT_BF16_F32_HI_A32>; } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in { def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; @@ -2077,14 +2077,14 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { def COPYQ_LANEQ_BF16 : IOpInst<"vcopy_laneq", "..I.I", "Qb", OP_COPY_LN>; } -let ArchGuard = "!defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = "bf16" in { let BigEndianSafe = 1 in { defm VREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQPcQPsQPl", "bQb">; } } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16" in { let BigEndianSafe = 1 in { defm VVREINTERPRET_BF : REINTERPRET_CROSS_TYPES< "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", "bQb">; @@ -2092,7 +2092,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "bf16" in { } // v8.9a/v9.4a LRCPC3 intrinsics -let ArchGuard = "defined(__aarch64__)", TargetGuard = "rcpc3" in { +let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "rcpc3" in { def VLDAP1_LANE : WInst<"vldap1_lane", ".(c*!).I", "QUlQlUlldQdPlQPl">; def VSTL1_LANE : WInst<"vstl1_lane", "v*(.!)I", "QUlQlUlldQdPlQPl">; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 5950dd74cfe83..23b268126de4e 100644 --- a/clang/include/clang/Parse/Parser.h +++ 
b/clang/include/clang/Parse/Parser.h @@ -18,6 +18,7 @@ #include "clang/Lex/CodeCompletionHandler.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Sema.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Frontend/OpenMP/OMPContext.h" #include "llvm/Support/SaveAndRestore.h" @@ -2537,7 +2538,7 @@ class Parser : public CodeCompletionHandler { /// Returns true for declaration, false for expression. bool isForInitDeclaration() { if (getLangOpts().OpenMP) - Actions.startOpenMPLoop(); + Actions.OpenMP().startOpenMPLoop(); if (getLangOpts().CPlusPlus) return Tok.is(tok::kw_using) || isCXXSimpleDeclaration(/*AllowForRangeDecl=*/true); @@ -3396,7 +3397,7 @@ class Parser : public CodeCompletionHandler { SourceLocation Loc); /// Parse clauses for '#pragma omp [begin] declare target'. - void ParseOMPDeclareTargetClauses(Sema::DeclareTargetContextInfo &DTCI); + void ParseOMPDeclareTargetClauses(SemaOpenMP::DeclareTargetContextInfo &DTCI); /// Parse '#pragma omp end declare target'. void ParseOMPEndDeclareTargetDirective(OpenMPDirectiveKind BeginDKind, @@ -3486,7 +3487,7 @@ class Parser : public CodeCompletionHandler { /// Parses indirect clause /// \param ParseOnly true to skip the clause's semantic actions and return // false; - bool ParseOpenMPIndirectClause(Sema::DeclareTargetContextInfo &DTCI, + bool ParseOpenMPIndirectClause(SemaOpenMP::DeclareTargetContextInfo &DTCI, bool ParseOnly); /// Parses clause with a single expression and an additional argument /// of a kind \a Kind. @@ -3556,12 +3557,12 @@ class Parser : public CodeCompletionHandler { /// Parses a reserved locator like 'omp_all_memory'. bool ParseOpenMPReservedLocator(OpenMPClauseKind Kind, - Sema::OpenMPVarListDataTy &Data, + SemaOpenMP::OpenMPVarListDataTy &Data, const LangOptions &LangOpts); /// Parses clauses with list. bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, SmallVectorImpl &Vars, - Sema::OpenMPVarListDataTy &Data); + SemaOpenMP::OpenMPVarListDataTy &Data); bool ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, bool EnteringContext, bool AllowDestructorName, bool AllowConstructorName, @@ -3569,11 +3570,11 @@ class Parser : public CodeCompletionHandler { SourceLocation *TemplateKWLoc, UnqualifiedId &Result); /// Parses the mapper modifier in map, to, and from clauses. - bool parseMapperModifier(Sema::OpenMPVarListDataTy &Data); + bool parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data); /// Parses map-type-modifiers in map clause. /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] map-type : ] list) /// where, map-type-modifier ::= always | close | mapper(mapper-identifier) - bool parseMapTypeModifiers(Sema::OpenMPVarListDataTy &Data); + bool parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data); //===--------------------------------------------------------------------===// // OpenACC Parsing. 
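For the parser, the Sema split is mechanical: every OpenMP entry point that used to live directly on Sema is now reached through the SemaOpenMP subobject returned by Sema::OpenMP(), and the helper types (DeclareTargetContextInfo, OpenMPVarListDataTy) move with it. A minimal sketch of the new call pattern; the fragment is hypothetical and assumes the usual parser context (Actions is the Parser's Sema instance, DKind and Kind are in scope), with only the accessor and the renamed types taken from the hunks above:

```cpp
// Hypothetical parser-side fragment (not real Parser code):
if (getLangOpts().OpenMP)
  Actions.OpenMP().startOpenMPLoop(); // was: Actions.startOpenMPLoop()

SemaOpenMP::OpenMPVarListDataTy Data; // was: Sema::OpenMPVarListDataTy
SmallVector<Expr *, 4> Vars;
bool ErrorFound = ParseOpenMPVarList(DKind, Kind, Vars, Data);
```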
diff --git a/clang/include/clang/Sema/Lookup.h b/clang/include/clang/Sema/Lookup.h index 2f2f2607a937f..0db5b847038ff 100644 --- a/clang/include/clang/Sema/Lookup.h +++ b/clang/include/clang/Sema/Lookup.h @@ -153,28 +153,30 @@ class LookupResult { using iterator = UnresolvedSetImpl::iterator; - LookupResult(Sema &SemaRef, const DeclarationNameInfo &NameInfo, - Sema::LookupNameKind LookupKind, - Sema::RedeclarationKind Redecl = Sema::NotForRedeclaration) + LookupResult( + Sema &SemaRef, const DeclarationNameInfo &NameInfo, + Sema::LookupNameKind LookupKind, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration) : SemaPtr(&SemaRef), NameInfo(NameInfo), LookupKind(LookupKind), - Redecl(Redecl != Sema::NotForRedeclaration), - ExternalRedecl(Redecl == Sema::ForExternalRedeclaration), - DiagnoseAccess(Redecl == Sema::NotForRedeclaration), - DiagnoseAmbiguous(Redecl == Sema::NotForRedeclaration) { + Redecl(Redecl != RedeclarationKind::NotForRedeclaration), + ExternalRedecl(Redecl == RedeclarationKind::ForExternalRedeclaration), + DiagnoseAccess(Redecl == RedeclarationKind::NotForRedeclaration), + DiagnoseAmbiguous(Redecl == RedeclarationKind::NotForRedeclaration) { configure(); } // TODO: consider whether this constructor should be restricted to take // as input a const IdentifierInfo* (instead of Name), // forcing other cases towards the constructor taking a DNInfo. - LookupResult(Sema &SemaRef, DeclarationName Name, SourceLocation NameLoc, - Sema::LookupNameKind LookupKind, - Sema::RedeclarationKind Redecl = Sema::NotForRedeclaration) + LookupResult( + Sema &SemaRef, DeclarationName Name, SourceLocation NameLoc, + Sema::LookupNameKind LookupKind, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration) : SemaPtr(&SemaRef), NameInfo(Name, NameLoc), LookupKind(LookupKind), - Redecl(Redecl != Sema::NotForRedeclaration), - ExternalRedecl(Redecl == Sema::ForExternalRedeclaration), - DiagnoseAccess(Redecl == Sema::NotForRedeclaration), - DiagnoseAmbiguous(Redecl == Sema::NotForRedeclaration) { + Redecl(Redecl != RedeclarationKind::NotForRedeclaration), + ExternalRedecl(Redecl == RedeclarationKind::ForExternalRedeclaration), + DiagnoseAccess(Redecl == RedeclarationKind::NotForRedeclaration), + DiagnoseAmbiguous(Redecl == RedeclarationKind::NotForRedeclaration) { configure(); } @@ -285,9 +287,10 @@ class LookupResult { return ExternalRedecl; } - Sema::RedeclarationKind redeclarationKind() const { - return ExternalRedecl ? Sema::ForExternalRedeclaration : - Redecl ? Sema::ForVisibleRedeclaration : Sema::NotForRedeclaration; + RedeclarationKind redeclarationKind() const { + return ExternalRedecl ? RedeclarationKind::ForExternalRedeclaration + : Redecl ? RedeclarationKind::ForVisibleRedeclaration + : RedeclarationKind::NotForRedeclaration; } /// Specify whether hidden declarations are visible, e.g., @@ -615,9 +618,9 @@ class LookupResult { } /// Change this lookup's redeclaration kind. 
- void setRedeclarationKind(Sema::RedeclarationKind RK) { - Redecl = (RK != Sema::NotForRedeclaration); - ExternalRedecl = (RK == Sema::ForExternalRedeclaration); + void setRedeclarationKind(RedeclarationKind RK) { + Redecl = (RK != RedeclarationKind::NotForRedeclaration); + ExternalRedecl = (RK == RedeclarationKind::ForExternalRedeclaration); configure(); } diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index e3857b2f07d9e..25a5fa05b21c7 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -94,7 +94,7 @@ struct PropertyData { : GetterId(getterId), SetterId(setterId) {} }; -} // namespace +} // namespace detail /// Wraps an identifier and optional source location for the identifier. struct IdentifierLoc { @@ -743,11 +743,6 @@ class AttributePool { IdentifierInfo *scopeName, SourceLocation scopeLoc, ArgsUnion *args, unsigned numArgs, ParsedAttr::Form form, SourceLocation ellipsisLoc = SourceLocation()) { - size_t temp = - ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::TypeTagForDatatypeData, ParsedType, detail::PropertyData>(numArgs, 0, 0, 0, 0); - (void)temp; void *memory = allocate( ParsedAttr::totalSizeToAlloc<ArgsUnion, detail::TypeTagForDatatypeData, ParsedType, detail::PropertyData>(numArgs, 0, 0, 0, 0)); diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -167,12 +165,6 @@ class ObjCMessageExpr; class ObjCMethodDecl; class ObjCPropertyDecl; class ObjCProtocolDecl; -class OMPThreadPrivateDecl; -class OMPRequiresDecl; -class OMPDeclareReductionDecl; -class OMPDeclareSimdDecl; -class OMPClause; -struct OMPVarListLocTy; struct OverloadCandidate; enum class OverloadCandidateParamOrder : char; enum OverloadCandidateRewriteKind : unsigned; @@ -187,6 +179,7 @@ class QualType; class SemaCUDA; class SemaHLSL; class SemaOpenACC; +class SemaOpenMP; class SemaSYCL; class StandardConversionSequence; class Stmt; @@ -360,6 +353,14 @@ class PreferredTypeBuilder { llvm::function_ref ComputeType; }; +struct SkipBodyInfo { + SkipBodyInfo() = default; + bool ShouldSkip = false; + bool CheckSameAsPrevious = false; + NamedDecl *Previous = nullptr; + NamedDecl *New = nullptr; +}; + /// Describes the result of template argument deduction. /// /// The TemplateDeductionResult enumeration describes the result of @@ -437,6 +438,20 @@ enum class CXXSpecialMemberKind { Invalid }; +/// The kind of conversion being performed. +enum class CheckedConversionKind { + /// An implicit conversion. + Implicit, + /// A C-style cast. + CStyleCast, + /// A functional-style cast. + FunctionalCast, + /// A cast other than a C-style cast. + OtherCast, + /// A conversion for an operand of a builtin overloaded operator. + ForBuiltinOverloadedOp +}; + /// Sema - This implements semantic analysis and AST building for C. /// \nosubgrouping class Sema final : public SemaBase { @@ -480,7 +495,6 @@ class Sema final : public SemaBase { // 35. Code Completion (SemaCodeComplete.cpp) // 36. FixIt Helpers (SemaFixItUtils.cpp) // 37. Name Lookup for RISC-V Vector Intrinsic (SemaRISCVVectorLookup.cpp) - // 38. OpenMP Directives and Clauses (SemaOpenMP.cpp) /// \name Semantic Analysis /// Implementations are in Sema.cpp @@ -701,28 +715,27 @@ class Sema final : public SemaBase { void checkTypeSupport(QualType Ty, SourceLocation Loc, ValueDecl *D = nullptr); - /// The kind of conversion being performed. - enum CheckedConversionKind { - /// An implicit conversion. - CCK_ImplicitConversion, - /// A C-style cast. - CCK_CStyleCast, - /// A functional-style cast. - CCK_FunctionalCast, - /// A cast other than a C-style cast. - CCK_OtherCast, - /// A conversion for an operand of a builtin overloaded operator.
- CCK_ForBuiltinOverloadedOp - }; /// ImpCastExprToType - If Expr is not of type 'Type', insert an implicit /// cast. If there is already an implicit cast, merge into the existing one. /// If isLvalue, the result of the cast is an lvalue. - ExprResult - ImpCastExprToType(Expr *E, QualType Type, CastKind CK, - ExprValueKind VK = VK_PRValue, - const CXXCastPath *BasePath = nullptr, - CheckedConversionKind CCK = CCK_ImplicitConversion); + ExprResult ImpCastExprToType( + Expr *E, QualType Type, CastKind CK, ExprValueKind VK = VK_PRValue, + const CXXCastPath *BasePath = nullptr, + CheckedConversionKind CCK = CheckedConversionKind::Implicit); /// ScalarTypeToBooleanCastKind - Returns the cast kind corresponding /// to the conversion from scalar type ScalarTy to the Boolean type. @@ -997,6 +1010,11 @@ class Sema final : public SemaBase { return *OpenACCPtr; } + SemaOpenMP &OpenMP() { + assert(OpenMPPtr && "SemaOpenMP is dead"); + return *OpenMPPtr; + } + SemaSYCL &SYCL() { assert(SYCLPtr); return *SYCLPtr; } @@ -1035,6 +1053,7 @@ class Sema final : public SemaBase { std::unique_ptr<SemaCUDA> CUDAPtr; std::unique_ptr<SemaHLSL> HLSLPtr; std::unique_ptr<SemaOpenACC> OpenACCPtr; + std::unique_ptr<SemaOpenMP> OpenMPPtr; std::unique_ptr<SemaSYCL> SYCLPtr; ///@} @@ -1776,8 +1795,9 @@ class Sema final : public SemaBase { public: static bool isCast(CheckedConversionKind CCK) { - return CCK == CCK_CStyleCast || CCK == CCK_FunctionalCast || - CCK == CCK_OtherCast; + return CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast || + CCK == CheckedConversionKind::OtherCast; } /// ActOnCXXNamedCast - Parse @@ -2630,14 +2650,6 @@ class Sema final : public SemaBase { return Entity->getOwningModule(); } - struct SkipBodyInfo { - SkipBodyInfo() = default; - bool ShouldSkip = false; - bool CheckSameAsPrevious = false; - NamedDecl *Previous = nullptr; - NamedDecl *New = nullptr; - }; - DeclGroupPtrTy ConvertDeclToDeclGroup(Decl *Ptr, Decl *OwnedType = nullptr); ParsedType getTypeName(const IdentifierInfo &II, SourceLocation NameLoc, @@ -3431,7 +3443,8 @@ class Sema final : public SemaBase { bool ConstexprSupported, bool CLinkageMayDiffer); /// type checking declaration initializers (C99 6.7.8) - bool CheckForConstantInitializer(Expr *e, QualType t); + bool CheckForConstantInitializer( + Expr *Init, unsigned DiagID = diag::err_init_element_not_constant); QualType deduceVarTypeFromInitializer(VarDecl *VDecl, DeclarationName Name, QualType Type, TypeSourceInfo *TSI, @@ -3443,14 +3456,6 @@ class Sema final : public SemaBase { sema::LambdaScopeInfo *RebuildLambdaScopeInfo(CXXMethodDecl *CallOperator); - /// The declarator \p D defines a function in the scope \p S which is nested - /// in an `omp begin/end declare variant` scope. In this method we create a - /// declaration for \p D and rename \p D according to the OpenMP context - /// selector of the surrounding scope. Return all base functions in \p Bases.
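Since CheckedConversionKind is now a scoped enum at namespace scope, call sites swap the old Sema-nested CCK_* enumerators for the scoped names; a minimal sketch (the wrapper function is hypothetical, the ImpCastExprToType signature is the one added above):

```cpp
// Hypothetical call site: Sema::CCK_ImplicitConversion becomes
// CheckedConversionKind::Implicit.
ExprResult castCondToBool(Sema &S, Expr *Cond) {
  return S.ImpCastExprToType(Cond, S.Context.BoolTy, CK_IntegralToBoolean,
                             VK_PRValue, /*BasePath=*/nullptr,
                             CheckedConversionKind::Implicit);
}
```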
- void ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( - Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParameterLists, - SmallVectorImpl &Bases); - // Heuristically tells if the function is `get_return_object` member of a // coroutine promise_type by matching the function name. static bool CanBeGetReturnObject(const FunctionDecl *FD); @@ -5533,32 +5538,6 @@ class Sema final : public SemaBase { Expr *ColumnIdx, SourceLocation RBLoc); - ExprResult ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc, - Expr *LowerBound, - SourceLocation ColonLocFirst, - SourceLocation ColonLocSecond, - Expr *Length, Expr *Stride, - SourceLocation RBLoc); - ExprResult ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc, - SourceLocation RParenLoc, - ArrayRef Dims, - ArrayRef Brackets); - - /// Data structure for iterator expression. - struct OMPIteratorData { - IdentifierInfo *DeclIdent = nullptr; - SourceLocation DeclIdentLoc; - ParsedType Type; - OMPIteratorExpr::IteratorRange Range; - SourceLocation AssignLoc; - SourceLocation ColonLoc; - SourceLocation SecColonLoc; - }; - - ExprResult ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc, - SourceLocation LLoc, SourceLocation RLoc, - ArrayRef Data); - bool ConvertArgumentsForCall(CallExpr *Call, Expr *Fn, FunctionDecl *FDecl, const FunctionProtoType *Proto, ArrayRef Args, SourceLocation RParenLoc, @@ -6775,11 +6754,10 @@ class Sema final : public SemaBase { bool IsStringLiteralToNonConstPointerConversion(Expr *From, QualType ToType); - ExprResult - PerformImplicitConversion(Expr *From, QualType ToType, - const ImplicitConversionSequence &ICS, - AssignmentAction Action, - CheckedConversionKind CCK = CCK_ImplicitConversion); + ExprResult PerformImplicitConversion( + Expr *From, QualType ToType, const ImplicitConversionSequence &ICS, + AssignmentAction Action, + CheckedConversionKind CCK = CheckedConversionKind::Implicit); ExprResult PerformImplicitConversion(Expr *From, QualType ToType, const StandardConversionSequence &SCS, AssignmentAction Action, @@ -7100,7 +7078,7 @@ class Sema final : public SemaBase { ExprResult PerformQualificationConversion( Expr *E, QualType Ty, ExprValueKind VK = VK_PRValue, - CheckedConversionKind CCK = CCK_ImplicitConversion); + CheckedConversionKind CCK = CheckedConversionKind::Implicit); bool CanPerformCopyInitialization(const InitializedEntity &Entity, ExprResult Init); @@ -7466,40 +7444,17 @@ class Sema final : public SemaBase { typedef std::function TypoRecoveryCallback; - /// Specifies whether (or how) name lookup is being performed for a - /// redeclaration (vs. a reference). - enum RedeclarationKind { - /// The lookup is a reference to this name that is not for the - /// purpose of redeclaring the name. - NotForRedeclaration = 0, - /// The lookup results will be used for redeclaration of a name, - /// if an entity by that name already exists and is visible. - ForVisibleRedeclaration, - /// The lookup results will be used for redeclaration of a name - /// with external linkage; non-visible lookup results with external linkage - /// may also be found. - ForExternalRedeclaration - }; - - RedeclarationKind forRedeclarationInCurContext() const { - // A declaration with an owning module for linkage can never link against - // anything that is not visible. We don't need to check linkage here; if - // the context has internal linkage, redeclaration lookup won't find things - // from other TUs, and we can't safely compute linkage yet in general. 
- if (cast<Decl>(CurContext) - ->getOwningModuleForLinkage(/*IgnoreLinkage*/ true)) - return ForVisibleRedeclaration; - return ForExternalRedeclaration; - } + RedeclarationKind forRedeclarationInCurContext() const; /// Look up a name, looking for a single declaration. Return /// null if the results were absent, ambiguous, or overloaded. /// /// It is preferable to use the elaborated form and explicitly handle /// ambiguity and overloading. - NamedDecl *LookupSingleName(Scope *S, DeclarationName Name, - SourceLocation Loc, LookupNameKind NameKind, - RedeclarationKind Redecl = NotForRedeclaration); + NamedDecl *LookupSingleName( + Scope *S, DeclarationName Name, SourceLocation Loc, + LookupNameKind NameKind, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration); bool LookupBuiltin(LookupResult &R); void LookupNecessaryTypesForBuiltin(Scope *S, unsigned ID); bool LookupName(LookupResult &R, Scope *S, bool AllowBuiltinCreation = false, @@ -7511,9 +7466,9 @@ class Sema final : public SemaBase { bool LookupParsedName(LookupResult &R, Scope *S, CXXScopeSpec *SS, bool AllowBuiltinCreation = false, bool EnteringContext = false); - ObjCProtocolDecl * - LookupProtocol(IdentifierInfo *II, SourceLocation IdLoc, - RedeclarationKind Redecl = NotForRedeclaration); + ObjCProtocolDecl *LookupProtocol( + IdentifierInfo *II, SourceLocation IdLoc, + RedeclarationKind Redecl = RedeclarationKind::NotForRedeclaration); bool LookupInSuper(LookupResult &R, CXXRecordDecl *Class); void LookupOverloadedOperatorName(OverloadedOperatorKind Op, Scope *S, @@ -12863,1373 +12818,6 @@ class Sema final : public SemaBase { std::unique_ptr<sema::RISCVIntrinsicManager> RVIntrinsicManager; ///@} - - // - - // - // ------------------------------------------------------------------------- - // - - /// \name OpenMP Directives and Clauses - /// Implementations are in SemaOpenMP.cpp - ///@{ - -public: - /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current - /// context is "used as device code". - /// - /// - If CurContext is a `declare target` function or it is known that the - /// function is emitted for the device, emits the diagnostics immediately. - /// - If CurContext is a non-`declare target` function and we are compiling - /// for the device, creates a diagnostic which is emitted if and when we - /// realize that the function will be codegen'ed. - /// - /// Example usage: - /// - /// // Variable-length arrays are not allowed in NVPTX device code. - /// if (diagIfOpenMPDeviceCode(Loc, diag::err_vla_unsupported)) - /// return ExprError(); - /// // Otherwise, continue parsing as normal. - SemaDiagnosticBuilder diagIfOpenMPDeviceCode(SourceLocation Loc, - unsigned DiagID, - const FunctionDecl *FD); - - /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current - /// context is "used as host code". - /// - /// - If CurContext is a `declare target` function or it is known that the - /// function is emitted for the host, emits the diagnostics immediately. - /// - If CurContext is a non-host function, just ignore it. - /// - /// Example usage: - /// - /// // Variable-length arrays are not allowed in NVPTX device code. - /// if (diagIfOpenMPHostCode(Loc, diag::err_vla_unsupported)) - /// return ExprError(); - /// // Otherwise, continue parsing as normal. - SemaDiagnosticBuilder diagIfOpenMPHostCode(SourceLocation Loc, - unsigned DiagID, - const FunctionDecl *FD); - - /// Register \p D as specialization of all base functions in \p Bases in the - /// current `omp begin/end declare variant` scope.
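forRedeclarationInCurContext keeps its logic and only loses its inline body; a sketch of the relocated out-of-line definition, reconstructed from the body deleted above (which .cpp file defines it is not shown in this diff and is an assumption):

```cpp
// Reconstructed from the inline body deleted above; only the enum scoping
// changes. The defining source file is an assumption, not shown in the diff.
RedeclarationKind Sema::forRedeclarationInCurContext() const {
  // A declaration with an owning module for linkage can never link against
  // anything that is not visible.
  if (cast<Decl>(CurContext)->getOwningModuleForLinkage(
          /*IgnoreLinkage*/ true))
    return RedeclarationKind::ForVisibleRedeclaration;
  return RedeclarationKind::ForExternalRedeclaration;
}
```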
- void ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( - Decl *D, SmallVectorImpl &Bases); - - /// Act on \p D, a function definition inside of an `omp [begin/end] assumes`. - void ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D); - - /// Can we exit an OpenMP declare variant scope at the moment. - bool isInOpenMPDeclareVariantScope() const { - return !OMPDeclareVariantScopes.empty(); - } - - ExprResult - VerifyPositiveIntegerConstantInClause(Expr *Op, OpenMPClauseKind CKind, - bool StrictlyPositive = true, - bool SuppressExprDiags = false); - - /// Given the potential call expression \p Call, determine if there is a - /// specialization via the OpenMP declare variant mechanism available. If - /// there is, return the specialized call expression, otherwise return the - /// original \p Call. - ExprResult ActOnOpenMPCall(ExprResult Call, Scope *Scope, - SourceLocation LParenLoc, MultiExprArg ArgExprs, - SourceLocation RParenLoc, Expr *ExecConfig); - - /// Handle a `omp begin declare variant`. - void ActOnOpenMPBeginDeclareVariant(SourceLocation Loc, OMPTraitInfo &TI); - - /// Handle a `omp end declare variant`. - void ActOnOpenMPEndDeclareVariant(); - - /// Function tries to capture lambda's captured variables in the OpenMP region - /// before the original lambda is captured. - void tryCaptureOpenMPLambdas(ValueDecl *V); - - /// Return true if the provided declaration \a VD should be captured by - /// reference. - /// \param Level Relative level of nested OpenMP construct for that the check - /// is performed. - /// \param OpenMPCaptureLevel Capture level within an OpenMP construct. - bool isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, - unsigned OpenMPCaptureLevel) const; - - /// Check if the specified variable is used in one of the private - /// clauses (private, firstprivate, lastprivate, reduction etc.) in OpenMP - /// constructs. - VarDecl *isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo = false, - unsigned StopAt = 0); - - /// The member expression (this->fd) needs to be rebuilt in the template - /// instantiation to generate private copy for OpenMP when default - /// clause is used. The function will return true if default - /// clause is used. - bool isOpenMPRebuildMemberExpr(ValueDecl *D); - - ExprResult getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK, - ExprObjectKind OK, SourceLocation Loc); - - /// If the current region is a loop-based region, mark the start of the loop - /// construct. - void startOpenMPLoop(); - - /// If the current region is a range loop-based region, mark the start of the - /// loop construct. - void startOpenMPCXXRangeFor(); - - /// Check if the specified variable is used in 'private' clause. - /// \param Level Relative level of nested OpenMP construct for that the check - /// is performed. - OpenMPClauseKind isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, - unsigned CapLevel) const; - - /// Sets OpenMP capture kind (OMPC_private, OMPC_firstprivate, OMPC_map etc.) - /// for \p FD based on DSA for the provided corresponding captured declaration - /// \p D. - void setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, unsigned Level); - - /// Check if the specified variable is captured by 'target' directive. - /// \param Level Relative level of nested OpenMP construct for that the check - /// is performed. - bool isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const; - - /// Check if the specified global variable must be captured by outer capture - /// regions.
- /// \param Level Relative level of nested OpenMP construct for that - /// the check is performed. - bool isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const; - - ExprResult PerformOpenMPImplicitIntegerConversion(SourceLocation OpLoc, - Expr *Op); - /// Called on start of new data sharing attribute block. - void StartOpenMPDSABlock(OpenMPDirectiveKind K, - const DeclarationNameInfo &DirName, Scope *CurScope, - SourceLocation Loc); - /// Start analysis of clauses. - void StartOpenMPClause(OpenMPClauseKind K); - /// End analysis of clauses. - void EndOpenMPClause(); - /// Called on end of data sharing attribute block. - void EndOpenMPDSABlock(Stmt *CurDirective); - - /// Check if the current region is an OpenMP loop region and if it is, - /// mark loop control variable, used in \p Init for loop initialization, as - /// private by default. - /// \param Init First part of the for loop. - void ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init); - - /// Called on well-formed '\#pragma omp metadirective' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPMetaDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - - // OpenMP directives and clauses. - /// Called on correct id-expression from the '#pragma omp - /// threadprivate'. - ExprResult ActOnOpenMPIdExpression(Scope *CurScope, CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id, - OpenMPDirectiveKind Kind); - /// Called on well-formed '#pragma omp threadprivate'. - DeclGroupPtrTy ActOnOpenMPThreadprivateDirective(SourceLocation Loc, - ArrayRef VarList); - /// Builds a new OpenMPThreadPrivateDecl and checks its correctness. - OMPThreadPrivateDecl *CheckOMPThreadPrivateDecl(SourceLocation Loc, - ArrayRef VarList); - /// Called on well-formed '#pragma omp allocate'. - DeclGroupPtrTy ActOnOpenMPAllocateDirective(SourceLocation Loc, - ArrayRef VarList, - ArrayRef Clauses, - DeclContext *Owner = nullptr); - - /// Called on well-formed '#pragma omp [begin] assume[s]'. - void ActOnOpenMPAssumesDirective(SourceLocation Loc, - OpenMPDirectiveKind DKind, - ArrayRef Assumptions, - bool SkippedClauses); - - /// Check if there is an active global `omp begin assumes` directive. - bool isInOpenMPAssumeScope() const { return !OMPAssumeScoped.empty(); } - - /// Check if there is an active global `omp assumes` directive. - bool hasGlobalOpenMPAssumes() const { return !OMPAssumeGlobal.empty(); } - - /// Called on well-formed '#pragma omp end assumes'. - void ActOnOpenMPEndAssumesDirective(); - - /// Called on well-formed '#pragma omp requires'. - DeclGroupPtrTy ActOnOpenMPRequiresDirective(SourceLocation Loc, - ArrayRef ClauseList); - /// Check restrictions on Requires directive - OMPRequiresDecl *CheckOMPRequiresDecl(SourceLocation Loc, - ArrayRef Clauses); - /// Check if the specified type is allowed to be used in 'omp declare - /// reduction' construct. - QualType ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, - TypeResult ParsedType); - /// Called on start of '#pragma omp declare reduction'. - DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveStart( - Scope *S, DeclContext *DC, DeclarationName Name, - ArrayRef> ReductionTypes, - AccessSpecifier AS, Decl *PrevDeclInScope = nullptr); - /// Initialize declare reduction construct initializer. - void ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D); - /// Finish current declare reduction construct initializer. 
- void ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner); - /// Initialize declare reduction construct initializer. - /// \return omp_priv variable. - VarDecl *ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D); - /// Finish current declare reduction construct initializer. - void ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer, - VarDecl *OmpPrivParm); - /// Called at the end of '#pragma omp declare reduction'. - DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveEnd( - Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid); - - /// Check variable declaration in 'omp declare mapper' construct. - TypeResult ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D); - /// Check if the specified type is allowed to be used in 'omp declare - /// mapper' construct. - QualType ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, - TypeResult ParsedType); - /// Called on start of '#pragma omp declare mapper'. - DeclGroupPtrTy ActOnOpenMPDeclareMapperDirective( - Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType, - SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS, - Expr *MapperVarRef, ArrayRef Clauses, - Decl *PrevDeclInScope = nullptr); - /// Build the mapper variable of '#pragma omp declare mapper'. - ExprResult ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S, - QualType MapperType, - SourceLocation StartLoc, - DeclarationName VN); - void ActOnOpenMPIteratorVarDecl(VarDecl *VD); - bool isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const; - const ValueDecl *getOpenMPDeclareMapperVarName() const; - - struct DeclareTargetContextInfo { - struct MapInfo { - OMPDeclareTargetDeclAttr::MapTypeTy MT; - SourceLocation Loc; - }; - /// Explicitly listed variables and functions in a 'to' or 'link' clause. - llvm::DenseMap ExplicitlyMapped; - - /// The 'device_type' as parsed from the clause. - OMPDeclareTargetDeclAttr::DevTypeTy DT = OMPDeclareTargetDeclAttr::DT_Any; - - /// The directive kind, `begin declare target` or `declare target`. - OpenMPDirectiveKind Kind; - - /// The directive with indirect clause. - std::optional Indirect; - - /// The directive location. - SourceLocation Loc; - - DeclareTargetContextInfo(OpenMPDirectiveKind Kind, SourceLocation Loc) - : Kind(Kind), Loc(Loc) {} - }; - - /// Called on the start of target region i.e. '#pragma omp declare target'. - bool ActOnStartOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI); - - /// Called at the end of target region i.e. '#pragma omp end declare target'. - const DeclareTargetContextInfo ActOnOpenMPEndDeclareTargetDirective(); - - /// Called once a target context is completed, that can be when a - /// '#pragma omp end declare target' was encountered or when a - /// '#pragma omp declare target' without declaration-definition-seq was - /// encountered. - void ActOnFinishedOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI); - - /// Report unterminated 'omp declare target' or 'omp begin declare target' at - /// the end of a compilation unit. - void DiagnoseUnterminatedOpenMPDeclareTarget(); - - /// Searches for the provided declaration name for OpenMP declare target - /// directive. - NamedDecl *lookupOpenMPDeclareTargetName(Scope *CurScope, - CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id); - - /// Called on correct id-expression from the '#pragma omp declare target'. 
- void ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, - OMPDeclareTargetDeclAttr::MapTypeTy MT, - DeclareTargetContextInfo &DTCI); - - /// Check declaration inside target region. - void - checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, - SourceLocation IdLoc = SourceLocation()); - - /// Adds OMPDeclareTargetDeclAttr to referenced variables in declare target - /// directive. - void ActOnOpenMPDeclareTargetInitializer(Decl *D); - - /// Finishes analysis of the deferred functions calls that may be declared as - /// host/nohost during device/host compilation. - void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, - const FunctionDecl *Callee, - SourceLocation Loc); - - /// Return true if currently in OpenMP task with untied clause context. - bool isInOpenMPTaskUntiedContext() const; - - /// Return true inside OpenMP declare target region. - bool isInOpenMPDeclareTargetContext() const { - return !DeclareTargetNesting.empty(); - } - /// Return true inside OpenMP target region. - bool isInOpenMPTargetExecutionDirective() const; - - /// Return the number of captured regions created for an OpenMP directive. - static int getOpenMPCaptureLevels(OpenMPDirectiveKind Kind); - - /// Initialization of captured region for OpenMP region. - void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope); - - /// Called for syntactical loops (ForStmt or CXXForRangeStmt) associated to - /// an OpenMP loop directive. - StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt); - - /// Process a canonical OpenMP loop nest that can either be a canonical - /// literal loop (ForStmt or CXXForRangeStmt), or the generated loop of an - /// OpenMP loop transformation construct. - StmtResult ActOnOpenMPLoopnest(Stmt *AStmt); - - /// End of OpenMP region. - /// - /// \param S Statement associated with the current OpenMP region. - /// \param Clauses List of clauses for the current OpenMP region. - /// - /// \returns Statement for finished OpenMP region. - StmtResult ActOnOpenMPRegionEnd(StmtResult S, ArrayRef Clauses); - StmtResult ActOnOpenMPExecutableDirective( - OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName, - OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, - OpenMPDirectiveKind PrevMappedDirective = llvm::omp::OMPD_unknown); - /// Called on well-formed '\#pragma omp parallel' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPParallelDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - using VarsWithInheritedDSAType = - llvm::SmallDenseMap; - /// Called on well-formed '\#pragma omp simd' after parsing - /// of the associated statement. - StmtResult - ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '#pragma omp tile' after parsing of its clauses and - /// the associated statement. - StmtResult ActOnOpenMPTileDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '#pragma omp unroll' after parsing of its clauses - /// and the associated statement. - StmtResult ActOnOpenMPUnrollDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp for' after parsing - /// of the associated statement. 
- StmtResult - ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp for simd' after parsing - /// of the associated statement. - StmtResult - ActOnOpenMPForSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp sections' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp section' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPSectionDirective(Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp scope' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPScopeDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp single' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPSingleDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp master' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPMasterDirective(Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp critical' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPCriticalDirective(const DeclarationNameInfo &DirName, - ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp parallel for' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel for simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel master' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMasterDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp parallel masked' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp parallel sections' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp task' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTaskDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp taskyield'. - StmtResult ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp error'. 
- /// Error directive is allowed in both declared and executable contexts. - /// InExContext identifies which context it is called from. - StmtResult ActOnOpenMPErrorDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - bool InExContext = true); - /// Called on well-formed '\#pragma omp barrier'. - StmtResult ActOnOpenMPBarrierDirective(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp taskwait'. - StmtResult ActOnOpenMPTaskwaitDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp taskgroup'. - StmtResult ActOnOpenMPTaskgroupDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp flush'. - StmtResult ActOnOpenMPFlushDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp depobj'. - StmtResult ActOnOpenMPDepobjDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp scan'. - StmtResult ActOnOpenMPScanDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp ordered' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPOrderedDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp atomic' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPAtomicDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTargetDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target data' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTargetDataDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target enter data' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt); - /// Called on well-formed '\#pragma omp target exit data' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt); - /// Called on well-formed '\#pragma omp target parallel' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target parallel for' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp teams loop' after parsing of the - /// associated statement.
- StmtResult ActOnOpenMPTeamsGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams loop' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTargetTeamsGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel loop' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPParallelGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target parallel loop' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPTargetParallelGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp cancellation point'. - StmtResult - ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion); - /// Called on well-formed '\#pragma omp cancel'. - StmtResult ActOnOpenMPCancelDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion); - /// Called on well-formed '\#pragma omp taskloop' after parsing of the - /// associated statement. - StmtResult - ActOnOpenMPTaskLoopDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp taskloop simd' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp master taskloop' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPMasterTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp master taskloop simd' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPMasterTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel master taskloop' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMasterTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel master taskloop simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMasterTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp masked taskloop' after parsing of the - /// associated statement. 
- StmtResult ActOnOpenMPMaskedTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp masked taskloop simd' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPMaskedTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel masked taskloop' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMaskedTaskLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp parallel masked taskloop simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPParallelMaskedTaskLoopSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp distribute' after parsing - /// of the associated statement. - StmtResult - ActOnOpenMPDistributeDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target update'. - StmtResult ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt); - /// Called on well-formed '\#pragma omp distribute parallel for' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPDistributeParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp distribute parallel for simd' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPDistributeParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp distribute simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPDistributeSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target parallel for simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target simd' after parsing of - /// the associated statement. - StmtResult - ActOnOpenMPTargetSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute' after parsing of - /// the associated statement. - StmtResult ActOnOpenMPTeamsDistributeDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute simd' after parsing - /// of the associated statement. 
- StmtResult ActOnOpenMPTeamsDistributeSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute parallel for simd' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPTeamsDistributeParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp teams distribute parallel for' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPTeamsDistributeParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp target teams distribute' after parsing - /// of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams distribute parallel for' - /// after parsing of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeParallelForDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams distribute parallel for - /// simd' after parsing of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp target teams distribute simd' after - /// parsing of the associated statement. - StmtResult ActOnOpenMPTargetTeamsDistributeSimdDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - /// Called on well-formed '\#pragma omp interop'. - StmtResult ActOnOpenMPInteropDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp dispatch' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPDispatchDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed '\#pragma omp masked' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed '\#pragma omp loop' after parsing of the - /// associated statement. - StmtResult ActOnOpenMPGenericLoopDirective( - ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); - - /// Checks correctness of linear modifiers. - bool CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind, - SourceLocation LinLoc); - /// Checks that the specified declaration matches requirements for the linear - /// decls.
- bool CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, - OpenMPLinearClauseKind LinKind, QualType Type, - bool IsDeclareSimd = false); - - /// Called on well-formed '\#pragma omp declare simd' after parsing of - /// the associated method/function. - DeclGroupPtrTy ActOnOpenMPDeclareSimdDirective( - DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, - Expr *Simdlen, ArrayRef Uniforms, ArrayRef Aligneds, - ArrayRef Alignments, ArrayRef Linears, - ArrayRef LinModifiers, ArrayRef Steps, SourceRange SR); - - /// Checks '\#pragma omp declare variant' variant function and original - /// functions after parsing of the associated method/function. - /// \param DG Function declaration to which the declare variant directive - /// is applied. - /// \param VariantRef Expression that references the variant function, which - /// must be used instead of the original one, specified in \p DG. - /// \param TI The trait info object representing the match clause. - /// \param NumAppendArgs The number of omp_interop_t arguments to account for - /// in checking. - /// \returns std::nullopt if the function/variant function are not compatible - /// with the pragma, pair of original function/variant ref expression - /// otherwise. - std::optional<std::pair<FunctionDecl *, Expr *>> - checkOpenMPDeclareVariantFunction(DeclGroupPtrTy DG, Expr *VariantRef, - OMPTraitInfo &TI, unsigned NumAppendArgs, - SourceRange SR); - - /// Called on well-formed '\#pragma omp declare variant' after parsing of - /// the associated method/function. - /// \param FD Function declaration to which the declare variant directive - /// is applied. - /// \param VariantRef Expression that references the variant function, which - /// must be used instead of the original one, specified in \p DG. - /// \param TI The context traits associated with the function variant. - /// \param AdjustArgsNothing The list of 'nothing' arguments. - /// \param AdjustArgsNeedDevicePtr The list of 'need_device_ptr' arguments. - /// \param AppendArgs The list of 'append_args' arguments. - /// \param AdjustArgsLoc The Location of an 'adjust_args' clause. - /// \param AppendArgsLoc The Location of an 'append_args' clause. - /// \param SR The SourceRange of the 'declare variant' directive. - void ActOnOpenMPDeclareVariantDirective( - FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, - ArrayRef AdjustArgsNothing, - ArrayRef AdjustArgsNeedDevicePtr, - ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, - SourceLocation AppendArgsLoc, SourceRange SR); - - OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'allocator' clause. - OMPClause *ActOnOpenMPAllocatorClause(Expr *Allocator, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'if' clause. - OMPClause *ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier, - Expr *Condition, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation NameModifierLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc); - /// Called on well-formed 'final' clause. - OMPClause *ActOnOpenMPFinalClause(Expr *Condition, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'num_threads' clause. - OMPClause *ActOnOpenMPNumThreadsClause(Expr *NumThreads, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'align' clause.
- OMPClause *ActOnOpenMPAlignClause(Expr *Alignment, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'safelen' clause. - OMPClause *ActOnOpenMPSafelenClause(Expr *Length, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'simdlen' clause. - OMPClause *ActOnOpenMPSimdlenClause(Expr *Length, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'sizes' clause. - OMPClause *ActOnOpenMPSizesClause(ArrayRef SizeExprs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'full' clause. - OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'partial' clause. - OMPClause *ActOnOpenMPPartialClause(Expr *FactorExpr, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'collapse' clause. - OMPClause *ActOnOpenMPCollapseClause(Expr *NumForLoops, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'ordered' clause. - OMPClause * - ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc, - SourceLocation LParenLoc = SourceLocation(), - Expr *NumForLoops = nullptr); - /// Called on well-formed 'grainsize' clause. - OMPClause *ActOnOpenMPGrainsizeClause(OpenMPGrainsizeClauseModifier Modifier, - Expr *Size, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc); - /// Called on well-formed 'num_tasks' clause. - OMPClause *ActOnOpenMPNumTasksClause(OpenMPNumTasksClauseModifier Modifier, - Expr *NumTasks, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc); - /// Called on well-formed 'hint' clause. - OMPClause *ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'detach' clause. - OMPClause *ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - OMPClause *ActOnOpenMPSimpleClause(OpenMPClauseKind Kind, unsigned Argument, - SourceLocation ArgumentLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'when' clause. - OMPClause *ActOnOpenMPWhenClause(OMPTraitInfo &TI, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'default' clause. - OMPClause *ActOnOpenMPDefaultClause(llvm::omp::DefaultKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'proc_bind' clause. - OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'order' clause. - OMPClause *ActOnOpenMPOrderClause(OpenMPOrderClauseModifier Modifier, - OpenMPOrderClauseKind Kind, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation MLoc, SourceLocation KindLoc, - SourceLocation EndLoc); - /// Called on well-formed 'update' clause.
- OMPClause *ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - OMPClause *ActOnOpenMPSingleExprWithArgClause( - OpenMPClauseKind Kind, ArrayRef Arguments, Expr *Expr, - SourceLocation StartLoc, SourceLocation LParenLoc, - ArrayRef ArgumentsLoc, SourceLocation DelimLoc, - SourceLocation EndLoc); - /// Called on well-formed 'schedule' clause. - OMPClause *ActOnOpenMPScheduleClause( - OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2, - OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, - SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc); - - OMPClause *ActOnOpenMPClause(OpenMPClauseKind Kind, SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'nowait' clause. - OMPClause *ActOnOpenMPNowaitClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'untied' clause. - OMPClause *ActOnOpenMPUntiedClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'mergeable' clause. - OMPClause *ActOnOpenMPMergeableClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'read' clause. - OMPClause *ActOnOpenMPReadClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'write' clause. - OMPClause *ActOnOpenMPWriteClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'update' clause. - OMPClause *ActOnOpenMPUpdateClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'capture' clause. - OMPClause *ActOnOpenMPCaptureClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'compare' clause. - OMPClause *ActOnOpenMPCompareClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'fail' clause. - OMPClause *ActOnOpenMPFailClause(SourceLocation StartLoc, - SourceLocation EndLoc); - OMPClause *ActOnOpenMPFailClause(OpenMPClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'seq_cst' clause. - OMPClause *ActOnOpenMPSeqCstClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'acq_rel' clause. - OMPClause *ActOnOpenMPAcqRelClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'acquire' clause. - OMPClause *ActOnOpenMPAcquireClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'release' clause. - OMPClause *ActOnOpenMPReleaseClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'relaxed' clause. - OMPClause *ActOnOpenMPRelaxedClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'weak' clause. - OMPClause *ActOnOpenMPWeakClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'init' clause. - OMPClause * - ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation VarLoc, SourceLocation EndLoc); - - /// Called on well-formed 'use' clause. - OMPClause *ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation VarLoc, SourceLocation EndLoc); - - /// Called on well-formed 'destroy' clause. 
- OMPClause *ActOnOpenMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation VarLoc, - SourceLocation EndLoc); - /// Called on well-formed 'novariants' clause. - OMPClause *ActOnOpenMPNovariantsClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'nocontext' clause. - OMPClause *ActOnOpenMPNocontextClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'filter' clause. - OMPClause *ActOnOpenMPFilterClause(Expr *ThreadID, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'threads' clause. - OMPClause *ActOnOpenMPThreadsClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'simd' clause. - OMPClause *ActOnOpenMPSIMDClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'nogroup' clause. - OMPClause *ActOnOpenMPNogroupClause(SourceLocation StartLoc, - SourceLocation EndLoc); - /// Called on well-formed 'unified_address' clause. - OMPClause *ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'unified_address' clause. - OMPClause *ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'reverse_offload' clause. - OMPClause *ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'dynamic_allocators' clause. - OMPClause *ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'atomic_default_mem_order' clause. - OMPClause *ActOnOpenMPAtomicDefaultMemOrderClause( - OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindLoc, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); - - /// Called on well-formed 'at' clause. - OMPClause *ActOnOpenMPAtClause(OpenMPAtClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'severity' clause. - OMPClause *ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'message' clause. - /// passing string for message. - OMPClause *ActOnOpenMPMessageClause(Expr *MS, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Data used for processing a list of variables in OpenMP clauses. - struct OpenMPVarListDataTy final { - Expr *DepModOrTailExpr = nullptr; - Expr *IteratorExpr = nullptr; - SourceLocation ColonLoc; - SourceLocation RLoc; - CXXScopeSpec ReductionOrMapperIdScopeSpec; - DeclarationNameInfo ReductionOrMapperId; - int ExtraModifier = -1; ///< Additional modifier for linear, map, depend or - ///< lastprivate clause. 
- SmallVector - MapTypeModifiers; - SmallVector - MapTypeModifiersLoc; - SmallVector - MotionModifiers; - SmallVector MotionModifiersLoc; - bool IsMapTypeImplicit = false; - SourceLocation ExtraModifierLoc; - SourceLocation OmpAllMemoryLoc; - SourceLocation - StepModifierLoc; /// 'step' modifier location for linear clause - }; - - OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, - ArrayRef Vars, - const OMPVarListLocTy &Locs, - OpenMPVarListDataTy &Data); - /// Called on well-formed 'inclusive' clause. - OMPClause *ActOnOpenMPInclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'exclusive' clause. - OMPClause *ActOnOpenMPExclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'allocate' clause. - OMPClause * - ActOnOpenMPAllocateClause(Expr *Allocator, ArrayRef VarList, - SourceLocation StartLoc, SourceLocation ColonLoc, - SourceLocation LParenLoc, SourceLocation EndLoc); - /// Called on well-formed 'private' clause. - OMPClause *ActOnOpenMPPrivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'firstprivate' clause. - OMPClause *ActOnOpenMPFirstprivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'lastprivate' clause. - OMPClause *ActOnOpenMPLastprivateClause( - ArrayRef VarList, OpenMPLastprivateModifier LPKind, - SourceLocation LPKindLoc, SourceLocation ColonLoc, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); - /// Called on well-formed 'shared' clause. - OMPClause *ActOnOpenMPSharedClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'reduction' clause. - OMPClause *ActOnOpenMPReductionClause( - ArrayRef VarList, OpenMPReductionClauseModifier Modifier, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ModifierLoc, SourceLocation ColonLoc, - SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions = std::nullopt); - /// Called on well-formed 'task_reduction' clause. - OMPClause *ActOnOpenMPTaskReductionClause( - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, - CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions = std::nullopt); - /// Called on well-formed 'in_reduction' clause. - OMPClause *ActOnOpenMPInReductionClause( - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, - CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions = std::nullopt); - /// Called on well-formed 'linear' clause. - OMPClause *ActOnOpenMPLinearClause( - ArrayRef VarList, Expr *Step, SourceLocation StartLoc, - SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind, - SourceLocation LinLoc, SourceLocation ColonLoc, - SourceLocation StepModifierLoc, SourceLocation EndLoc); - /// Called on well-formed 'aligned' clause. 
- OMPClause *ActOnOpenMPAlignedClause(ArrayRef VarList, Expr *Alignment, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc); - /// Called on well-formed 'copyin' clause. - OMPClause *ActOnOpenMPCopyinClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'copyprivate' clause. - OMPClause *ActOnOpenMPCopyprivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'flush' pseudo clause. - OMPClause *ActOnOpenMPFlushClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'depobj' pseudo clause. - OMPClause *ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'depend' clause. - OMPClause *ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, - Expr *DepModifier, - ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'device' clause. - OMPClause *ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, - Expr *Device, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc); - /// Called on well-formed 'map' clause. - OMPClause *ActOnOpenMPMapClause( - Expr *IteratorModifier, ArrayRef MapTypeModifiers, - ArrayRef MapTypeModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, - OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, - SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, - const OMPVarListLocTy &Locs, bool NoDiagnose = false, - ArrayRef UnresolvedMappers = std::nullopt); - /// Called on well-formed 'num_teams' clause. - OMPClause *ActOnOpenMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'thread_limit' clause. - OMPClause *ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'priority' clause. - OMPClause *ActOnOpenMPPriorityClause(Expr *Priority, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - /// Called on well-formed 'dist_schedule' clause. - OMPClause *ActOnOpenMPDistScheduleClause( - OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, - SourceLocation CommaLoc, SourceLocation EndLoc); - /// Called on well-formed 'defaultmap' clause. - OMPClause *ActOnOpenMPDefaultmapClause( - OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, - SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, - SourceLocation KindLoc, SourceLocation EndLoc); - /// Called on well-formed 'to' clause. - OMPClause * - ActOnOpenMPToClause(ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, SourceLocation ColonLoc, - ArrayRef VarList, const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers = std::nullopt); - /// Called on well-formed 'from' clause. 
- OMPClause * - ActOnOpenMPFromClause(ArrayRef MotionModifiers, - ArrayRef MotionModifiersLoc, - CXXScopeSpec &MapperIdScopeSpec, - DeclarationNameInfo &MapperId, SourceLocation ColonLoc, - ArrayRef VarList, const OMPVarListLocTy &Locs, - ArrayRef UnresolvedMappers = std::nullopt); - /// Called on well-formed 'use_device_ptr' clause. - OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'use_device_addr' clause. - OMPClause *ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'is_device_ptr' clause. - OMPClause *ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'has_device_addr' clause. - OMPClause *ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs); - /// Called on well-formed 'nontemporal' clause. - OMPClause *ActOnOpenMPNontemporalClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Data for list of allocators. - struct UsesAllocatorsData { - /// Allocator. - Expr *Allocator = nullptr; - /// Allocator traits. - Expr *AllocatorTraits = nullptr; - /// Locations of '(' and ')' symbols. - SourceLocation LParenLoc, RParenLoc; - }; - /// Called on well-formed 'uses_allocators' clause. - OMPClause *ActOnOpenMPUsesAllocatorClause(SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc, - ArrayRef Data); - /// Called on well-formed 'affinity' clause. - OMPClause *ActOnOpenMPAffinityClause(SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc, Expr *Modifier, - ArrayRef Locators); - /// Called on a well-formed 'bind' clause. - OMPClause *ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on a well-formed 'ompx_dyn_cgroup_mem' clause. - OMPClause *ActOnOpenMPXDynCGroupMemClause(Expr *Size, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on well-formed 'doacross' clause. - OMPClause * - ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType, - SourceLocation DepLoc, SourceLocation ColonLoc, - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation EndLoc); - - /// Called on a well-formed 'ompx_attribute' clause. - OMPClause *ActOnOpenMPXAttributeClause(ArrayRef Attrs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc); - - /// Called on a well-formed 'ompx_bare' clause. - OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc, - SourceLocation EndLoc); - -private: - void *VarDataSharingAttributesStack; - - /// Number of nested '#pragma omp declare target' directives. - SmallVector DeclareTargetNesting; - - /// Initialization of data-sharing attributes stack. - void InitDataSharingAttributesStack(); - void DestroyDataSharingAttributesStack(); - - /// Returns OpenMP nesting level for current directive. - unsigned getOpenMPNestingLevel() const; - - /// Adjusts the function scopes index for the target-based regions. - void adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex, - unsigned Level) const; - - /// Returns the number of scopes associated with the construct on the given - /// OpenMP level. - int getNumberOfConstructScopes(unsigned Level) const; - - /// Push new OpenMP function region for non-capturing function. 
- void pushOpenMPFunctionRegion(); - - /// Pop OpenMP function region for non-capturing function. - void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI); - - /// Analyzes and checks a loop nest for use by a loop transformation. - /// - /// \param Kind The loop transformation directive kind. - /// \param NumLoops How many nested loops the directive is expecting. - /// \param AStmt Associated statement of the transformation directive. - /// \param LoopHelpers [out] The loop analysis result. - /// \param Body [out] The body code nested in \p NumLoops loop. - /// \param OriginalInits [out] Collection of statements and declarations that - /// must have been executed/declared before entering the - /// loop. - /// - /// \return Whether there was any error. - bool checkTransformableLoopNest( - OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, - SmallVectorImpl &LoopHelpers, - Stmt *&Body, - SmallVectorImpl, 0>> - &OriginalInits); - - /// Helper to keep information about the current `omp begin/end declare - /// variant` nesting. - struct OMPDeclareVariantScope { - /// The associated OpenMP context selector. - OMPTraitInfo *TI; - - /// The associated OpenMP context selector mangling. - std::string NameSuffix; - - OMPDeclareVariantScope(OMPTraitInfo &TI); - }; - - /// Return the OMPTraitInfo for the surrounding scope, if any. - OMPTraitInfo *getOMPTraitInfoForSurroundingScope() { - return OMPDeclareVariantScopes.empty() ? nullptr - : OMPDeclareVariantScopes.back().TI; - } - - /// The current `omp begin/end declare variant` scopes. - SmallVector OMPDeclareVariantScopes; - - /// The current `omp begin/end assumes` scopes. - SmallVector OMPAssumeScoped; - - /// All `omp assumes` we encountered so far. - SmallVector OMPAssumeGlobal; - - /// OMPD_loop is mapped to OMPD_for, OMPD_distribute or OMPD_simd depending - /// on the parameter of the bind clause. In the methods for the - /// mapped directives, check the parameters of the lastprivate clause. - bool checkLastPrivateForMappedDirectives(ArrayRef Clauses); - /// Depending on the bind clause of OMPD_loop map the directive to new - /// directives. - /// 1) loop bind(parallel) --> OMPD_for - /// 2) loop bind(teams) --> OMPD_distribute - /// 3) loop bind(thread) --> OMPD_simd - /// This is being handled in Sema instead of Codegen because of the need for - /// rigorous semantic checking in the new mapped directives. 
-  bool mapLoopConstruct(llvm::SmallVector<OMPClause *> &ClausesWithoutBind,
-                        ArrayRef<OMPClause *> Clauses,
-                        OpenMPBindClauseKind &BindKind,
-                        OpenMPDirectiveKind &Kind,
-                        OpenMPDirectiveKind &PrevMappedDirective,
-                        SourceLocation StartLoc, SourceLocation EndLoc,
-                        const DeclarationNameInfo &DirName,
-                        OpenMPDirectiveKind CancelRegion);
-
-  ///@}
 };
 
 DeductionFailureInfo
diff --git a/clang/include/clang/Sema/SemaOpenACC.h b/clang/include/clang/Sema/SemaOpenACC.h
index c1fe0f5b9c0f6..329dc3945fa2a 100644
--- a/clang/include/clang/Sema/SemaOpenACC.h
+++ b/clang/include/clang/Sema/SemaOpenACC.h
@@ -44,7 +44,8 @@ class SemaOpenACC : public SemaBase {
     Expr *ConditionExpr;
   };
 
-  std::variant<std::monostate, DefaultDetails, ConditionDetails> Details;
+  std::variant<std::monostate, DefaultDetails, ConditionDetails> Details =
+      std::monostate{};
 
 public:
   OpenACCParsedClause(OpenACCDirectiveKind DirKind,
@@ -72,8 +73,17 @@ class SemaOpenACC : public SemaBase {
   }
 
   Expr *getConditionExpr() {
-    assert(ClauseKind == OpenACCClauseKind::If &&
+    assert((ClauseKind == OpenACCClauseKind::If ||
+            (ClauseKind == OpenACCClauseKind::Self &&
+             DirKind != OpenACCDirectiveKind::Update)) &&
            "Parsed clause kind does not have a condition expr");
+
+    // 'self' has an optional ConditionExpr, so be tolerant of that. This will
+    // assert in variant otherwise.
+    if (ClauseKind == OpenACCClauseKind::Self &&
+        std::holds_alternative<std::monostate>(Details))
+      return nullptr;
+
     return std::get<ConditionDetails>(Details).ConditionExpr;
   }
 
@@ -87,7 +97,9 @@ class SemaOpenACC : public SemaBase {
   }
 
   void setConditionDetails(Expr *ConditionExpr) {
-    assert(ClauseKind == OpenACCClauseKind::If &&
+    assert((ClauseKind == OpenACCClauseKind::If ||
+            (ClauseKind == OpenACCClauseKind::Self &&
+             DirKind != OpenACCDirectiveKind::Update)) &&
            "Parsed clause kind does not have a condition expr");
     // In C++ we can count on this being a 'bool', but in C this gets left as
     // some sort of scalar that codegen will have to take care of converting.
diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h
new file mode 100644
index 0000000000000..9927459bbc594
--- /dev/null
+++ b/clang/include/clang/Sema/SemaOpenMP.h
@@ -0,0 +1,1447 @@
+//===----- SemaOpenMP.h -- Semantic Analysis for OpenMP constructs -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares semantic analysis for OpenMP constructs and
+/// clauses.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SEMA_SEMAOPENMP_H
+#define LLVM_CLANG_SEMA_SEMAOPENMP_H
+
+#include "clang/AST/Attr.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/DeclarationName.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExprOpenMP.h"
+#include "clang/AST/OpenMPClause.h"
+#include "clang/AST/Stmt.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/Type.h"
+#include "clang/Basic/IdentifierTable.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Basic/OpenMPKinds.h"
+#include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/Specifiers.h"
+#include "clang/Sema/DeclSpec.h"
+#include "clang/Sema/Ownership.h"
+#include "clang/Sema/Scope.h"
+#include "clang/Sema/ScopeInfo.h"
+#include "clang/Sema/SemaBase.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerUnion.h"
+#include <optional>
+#include <string>
+#include <utility>
+
+namespace clang {
+
+class SemaOpenMP : public SemaBase {
+public:
+  SemaOpenMP(Sema &S);
+
+  friend class Parser;
+  friend class Sema;
+
+  using DeclGroupPtrTy = OpaquePtr<DeclGroupRef>;
+  using CapturedParamNameType = std::pair<StringRef, QualType>;
+
+  /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current
+  /// context is "used as device code".
+  ///
+  /// - If CurContext is a `declare target` function or it is known that the
+  ///   function is emitted for the device, emits the diagnostics immediately.
+  /// - If CurContext is a non-`declare target` function and we are compiling
+  ///   for the device, creates a diagnostic which is emitted if and when we
+  ///   realize that the function will be codegen'ed.
+  ///
+  /// Example usage:
+  ///
+  ///  // Variable-length arrays are not allowed in NVPTX device code.
+  ///  if (diagIfOpenMPDeviceCode(Loc, diag::err_vla_unsupported))
+  ///    return ExprError();
+  ///  // Otherwise, continue parsing as normal.
+  SemaDiagnosticBuilder diagIfOpenMPDeviceCode(SourceLocation Loc,
+                                               unsigned DiagID,
+                                               const FunctionDecl *FD);
+
+  /// Creates a SemaDiagnosticBuilder that emits the diagnostic if the current
+  /// context is "used as host code".
+  ///
+  /// - If CurContext is a `declare target` function or it is known that the
+  ///   function is emitted for the host, emits the diagnostics immediately.
+  /// - If CurContext is a non-host function, just ignore it.
+  ///
+  /// Example usage:
+  ///
+  ///  // Variable-length arrays are not allowed in NVPTX device code.
+  ///  if (diagIfOpenMPHostCode(Loc, diag::err_vla_unsupported))
+  ///    return ExprError();
+  ///  // Otherwise, continue parsing as normal.
+  SemaDiagnosticBuilder diagIfOpenMPHostCode(SourceLocation Loc,
+                                             unsigned DiagID,
+                                             const FunctionDecl *FD);
+
+  /// The declarator \p D defines a function in the scope \p S which is nested
+  /// in an `omp begin/end declare variant` scope. In this method we create a
+  /// declaration for \p D and rename \p D according to the OpenMP context
+  /// selector of the surrounding scope. Return all base functions in \p Bases.
+  void ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope(
+      Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParameterLists,
+      SmallVectorImpl<FunctionDecl *> &Bases);
+
+  /// Register \p D as specialization of all base functions in \p Bases in the
+  /// current `omp begin/end declare variant` scope.
+  void ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(
+      Decl *D, SmallVectorImpl<FunctionDecl *> &Bases);
+
+  /// Act on \p D, a function definition inside of an `omp [begin/end] assumes`.
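+  ///
+  /// For example (an illustrative sketch; no_openmp_routines is one of the
+  /// standard assumption clauses):
+  ///
+  ///  #pragma omp begin assumes no_openmp_routines
+  ///  void foo() {}
+  ///  #pragma omp end assumes
+  ///
+  /// This hook runs once the definition of foo() is complete.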
+  void ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D);
+
+  /// Can we exit an OpenMP declare variant scope at the moment?
+  bool isInOpenMPDeclareVariantScope() const {
+    return !OMPDeclareVariantScopes.empty();
+  }
+
+  ExprResult
+  VerifyPositiveIntegerConstantInClause(Expr *Op, OpenMPClauseKind CKind,
+                                        bool StrictlyPositive = true,
+                                        bool SuppressExprDiags = false);
+
+  /// Given the potential call expression \p Call, determine if there is a
+  /// specialization via the OpenMP declare variant mechanism available. If
+  /// there is, return the specialized call expression, otherwise return the
+  /// original \p Call.
+  ExprResult ActOnOpenMPCall(ExprResult Call, Scope *Scope,
+                             SourceLocation LParenLoc, MultiExprArg ArgExprs,
+                             SourceLocation RParenLoc, Expr *ExecConfig);
+
+  /// Handle a `omp begin declare variant`.
+  void ActOnOpenMPBeginDeclareVariant(SourceLocation Loc, OMPTraitInfo &TI);
+
+  /// Handle a `omp end declare variant`.
+  void ActOnOpenMPEndDeclareVariant();
+
+  /// Tries to capture the lambda's captured variables in the OpenMP region
+  /// before the original lambda is captured.
+  void tryCaptureOpenMPLambdas(ValueDecl *V);
+
+  /// Return true if the provided declaration \a D should be captured by
+  /// reference.
+  /// \param Level Relative level of nested OpenMP construct for that the check
+  /// is performed.
+  /// \param OpenMPCaptureLevel Capture level within an OpenMP construct.
+  bool isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level,
+                             unsigned OpenMPCaptureLevel) const;
+
+  /// Check if the specified variable is used in one of the private
+  /// clauses (private, firstprivate, lastprivate, reduction etc.) in OpenMP
+  /// constructs.
+  VarDecl *isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo = false,
+                                unsigned StopAt = 0);
+
+  /// The member expression (this->fd) needs to be rebuilt in the template
+  /// instantiation to generate a private copy for OpenMP when the default
+  /// clause is used. The function will return true if the default
+  /// clause is used.
+  bool isOpenMPRebuildMemberExpr(ValueDecl *D);
+
+  ExprResult getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK,
+                                   ExprObjectKind OK, SourceLocation Loc);
+
+  /// If the current region is a loop-based region, mark the start of the loop
+  /// construct.
+  void startOpenMPLoop();
+
+  /// If the current region is a range loop-based region, mark the start of the
+  /// loop construct.
+  void startOpenMPCXXRangeFor();
+
+  /// Check if the specified variable is used in 'private' clause.
+  /// \param Level Relative level of nested OpenMP construct for that the check
+  /// is performed.
+  OpenMPClauseKind isOpenMPPrivateDecl(ValueDecl *D, unsigned Level,
+                                       unsigned CapLevel) const;
+
+  /// Sets OpenMP capture kind (OMPC_private, OMPC_firstprivate, OMPC_map etc.)
+  /// for \p FD based on DSA for the provided corresponding captured declaration
+  /// \p D.
+  void setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, unsigned Level);
+
+  /// Check if the specified variable is captured by 'target' directive.
+  /// \param Level Relative level of nested OpenMP construct for that the check
+  /// is performed.
+  bool isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level,
+                                  unsigned CaptureLevel) const;
+
+  /// Check if the specified global variable must be captured by outer capture
+  /// regions.
+  /// \param Level Relative level of nested OpenMP construct for that
+  /// the check is performed.
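+  ///
+  /// For example (illustrative):
+  ///
+  ///  int Gbl;
+  ///  void f() {
+  ///  #pragma omp target
+  ///    Gbl += 1;
+  ///  }
+  ///
+  /// The reference to Gbl inside the target region forces it to be captured
+  /// by the enclosing target capture region.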
+ bool isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, + unsigned CaptureLevel) const; + + ExprResult PerformOpenMPImplicitIntegerConversion(SourceLocation OpLoc, + Expr *Op); + /// Called on start of new data sharing attribute block. + void StartOpenMPDSABlock(OpenMPDirectiveKind K, + const DeclarationNameInfo &DirName, Scope *CurScope, + SourceLocation Loc); + /// Start analysis of clauses. + void StartOpenMPClause(OpenMPClauseKind K); + /// End analysis of clauses. + void EndOpenMPClause(); + /// Called on end of data sharing attribute block. + void EndOpenMPDSABlock(Stmt *CurDirective); + + /// Check if the current region is an OpenMP loop region and if it is, + /// mark loop control variable, used in \p Init for loop initialization, as + /// private by default. + /// \param Init First part of the for loop. + void ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init); + + /// Called on well-formed '\#pragma omp metadirective' after parsing + /// of the associated statement. + StmtResult ActOnOpenMPMetaDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + + // OpenMP directives and clauses. + /// Called on correct id-expression from the '#pragma omp + /// threadprivate'. + ExprResult ActOnOpenMPIdExpression(Scope *CurScope, CXXScopeSpec &ScopeSpec, + const DeclarationNameInfo &Id, + OpenMPDirectiveKind Kind); + /// Called on well-formed '#pragma omp threadprivate'. + DeclGroupPtrTy ActOnOpenMPThreadprivateDirective(SourceLocation Loc, + ArrayRef VarList); + /// Builds a new OpenMPThreadPrivateDecl and checks its correctness. + OMPThreadPrivateDecl *CheckOMPThreadPrivateDecl(SourceLocation Loc, + ArrayRef VarList); + /// Called on well-formed '#pragma omp allocate'. + DeclGroupPtrTy ActOnOpenMPAllocateDirective(SourceLocation Loc, + ArrayRef VarList, + ArrayRef Clauses, + DeclContext *Owner = nullptr); + + /// Called on well-formed '#pragma omp [begin] assume[s]'. + void ActOnOpenMPAssumesDirective(SourceLocation Loc, + OpenMPDirectiveKind DKind, + ArrayRef Assumptions, + bool SkippedClauses); + + /// Check if there is an active global `omp begin assumes` directive. + bool isInOpenMPAssumeScope() const { return !OMPAssumeScoped.empty(); } + + /// Check if there is an active global `omp assumes` directive. + bool hasGlobalOpenMPAssumes() const { return !OMPAssumeGlobal.empty(); } + + /// Called on well-formed '#pragma omp end assumes'. + void ActOnOpenMPEndAssumesDirective(); + + /// Called on well-formed '#pragma omp requires'. + DeclGroupPtrTy ActOnOpenMPRequiresDirective(SourceLocation Loc, + ArrayRef ClauseList); + /// Check restrictions on Requires directive + OMPRequiresDecl *CheckOMPRequiresDecl(SourceLocation Loc, + ArrayRef Clauses); + /// Check if the specified type is allowed to be used in 'omp declare + /// reduction' construct. + QualType ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, + TypeResult ParsedType); + /// Called on start of '#pragma omp declare reduction'. + DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveStart( + Scope *S, DeclContext *DC, DeclarationName Name, + ArrayRef> ReductionTypes, + AccessSpecifier AS, Decl *PrevDeclInScope = nullptr); + /// Initialize declare reduction construct initializer. + void ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D); + /// Finish current declare reduction construct initializer. + void ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner); + /// Initialize declare reduction construct initializer. + /// \return omp_priv variable. 
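+  ///
+  /// For example (illustrative, MyInt being some user-defined type):
+  ///
+  ///  #pragma omp declare reduction(add : MyInt : omp_out += omp_in) \
+  ///      initializer(omp_priv = MyInt(0))
+  ///
+  /// Here the 'initializer' clause initializes the implicit omp_priv
+  /// variable returned by this function.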
+  VarDecl *ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D);
+  /// Finish current declare reduction construct initializer.
+  void ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer,
+                                                 VarDecl *OmpPrivParm);
+  /// Called at the end of '#pragma omp declare reduction'.
+  DeclGroupPtrTy ActOnOpenMPDeclareReductionDirectiveEnd(
+      Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid);
+
+  /// Check variable declaration in 'omp declare mapper' construct.
+  TypeResult ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D);
+  /// Check if the specified type is allowed to be used in 'omp declare
+  /// mapper' construct.
+  QualType ActOnOpenMPDeclareMapperType(SourceLocation TyLoc,
+                                        TypeResult ParsedType);
+  /// Called on start of '#pragma omp declare mapper'.
+  DeclGroupPtrTy ActOnOpenMPDeclareMapperDirective(
+      Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType,
+      SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS,
+      Expr *MapperVarRef, ArrayRef<OMPClause *> Clauses,
+      Decl *PrevDeclInScope = nullptr);
+  /// Build the mapper variable of '#pragma omp declare mapper'.
+  ExprResult ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S,
+                                                      QualType MapperType,
+                                                      SourceLocation StartLoc,
+                                                      DeclarationName VN);
+  void ActOnOpenMPIteratorVarDecl(VarDecl *VD);
+  bool isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const;
+  const ValueDecl *getOpenMPDeclareMapperVarName() const;
+
+  struct DeclareTargetContextInfo {
+    struct MapInfo {
+      OMPDeclareTargetDeclAttr::MapTypeTy MT;
+      SourceLocation Loc;
+    };
+    /// Explicitly listed variables and functions in a 'to' or 'link' clause.
+    llvm::DenseMap<NamedDecl *, MapInfo> ExplicitlyMapped;
+
+    /// The 'device_type' as parsed from the clause.
+    OMPDeclareTargetDeclAttr::DevTypeTy DT = OMPDeclareTargetDeclAttr::DT_Any;
+
+    /// The directive kind, `begin declare target` or `declare target`.
+    OpenMPDirectiveKind Kind;
+
+    /// The directive with indirect clause.
+    std::optional<Expr *> Indirect;
+
+    /// The directive location.
+    SourceLocation Loc;
+
+    DeclareTargetContextInfo(OpenMPDirectiveKind Kind, SourceLocation Loc)
+        : Kind(Kind), Loc(Loc) {}
+  };
+
+  /// Called on the start of a target region, i.e. '#pragma omp declare
+  /// target'.
+  bool ActOnStartOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI);
+
+  /// Called at the end of a target region, i.e. '#pragma omp end declare
+  /// target'.
+  const DeclareTargetContextInfo ActOnOpenMPEndDeclareTargetDirective();
+
+  /// Called once a target context is completed, which can be when a
+  /// '#pragma omp end declare target' was encountered or when a
+  /// '#pragma omp declare target' without declaration-definition-seq was
+  /// encountered.
+  void ActOnFinishedOpenMPDeclareTargetContext(DeclareTargetContextInfo &DTCI);
+
+  /// Report unterminated 'omp declare target' or 'omp begin declare target' at
+  /// the end of a compilation unit.
+  void DiagnoseUnterminatedOpenMPDeclareTarget();
+
+  /// Searches for the provided declaration name for OpenMP declare target
+  /// directive.
+  NamedDecl *lookupOpenMPDeclareTargetName(Scope *CurScope,
+                                           CXXScopeSpec &ScopeSpec,
+                                           const DeclarationNameInfo &Id);
+
+  /// Called on correct id-expression from the '#pragma omp declare target'.
+  void ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc,
+                                    OMPDeclareTargetDeclAttr::MapTypeTy MT,
+                                    DeclareTargetContextInfo &DTCI);
+
+  /// Check declaration inside target region.
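+  ///
+  /// For example (illustrative), declarations appearing between
+  ///
+  ///  #pragma omp declare target
+  ///  int X;
+  ///  void g();
+  ///  #pragma omp end declare target
+  ///
+  /// are checked here for validity in the device environment.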
+ void + checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, + SourceLocation IdLoc = SourceLocation()); + + /// Adds OMPDeclareTargetDeclAttr to referenced variables in declare target + /// directive. + void ActOnOpenMPDeclareTargetInitializer(Decl *D); + + /// Finishes analysis of the deferred functions calls that may be declared as + /// host/nohost during device/host compilation. + void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, + const FunctionDecl *Callee, + SourceLocation Loc); + + /// Return true if currently in OpenMP task with untied clause context. + bool isInOpenMPTaskUntiedContext() const; + + /// Return true inside OpenMP declare target region. + bool isInOpenMPDeclareTargetContext() const { + return !DeclareTargetNesting.empty(); + } + /// Return true inside OpenMP target region. + bool isInOpenMPTargetExecutionDirective() const; + + /// Return the number of captured regions created for an OpenMP directive. + static int getOpenMPCaptureLevels(OpenMPDirectiveKind Kind); + + /// Initialization of captured region for OpenMP region. + void ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope); + + /// Called for syntactical loops (ForStmt or CXXForRangeStmt) associated to + /// an OpenMP loop directive. + StmtResult ActOnOpenMPCanonicalLoop(Stmt *AStmt); + + /// Process a canonical OpenMP loop nest that can either be a canonical + /// literal loop (ForStmt or CXXForRangeStmt), or the generated loop of an + /// OpenMP loop transformation construct. + StmtResult ActOnOpenMPLoopnest(Stmt *AStmt); + + /// End of OpenMP region. + /// + /// \param S Statement associated with the current OpenMP region. + /// \param Clauses List of clauses for the current OpenMP region. + /// + /// \returns Statement for finished OpenMP region. + StmtResult ActOnOpenMPRegionEnd(StmtResult S, ArrayRef Clauses); + StmtResult ActOnOpenMPExecutableDirective( + OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName, + OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, + OpenMPDirectiveKind PrevMappedDirective = llvm::omp::OMPD_unknown); + /// Called on well-formed '\#pragma omp parallel' after parsing + /// of the associated statement. + StmtResult ActOnOpenMPParallelDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + using VarsWithInheritedDSAType = + llvm::SmallDenseMap; + /// Called on well-formed '\#pragma omp simd' after parsing + /// of the associated statement. + StmtResult + ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt, + SourceLocation StartLoc, SourceLocation EndLoc, + VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '#pragma omp tile' after parsing of its clauses and + /// the associated statement. + StmtResult ActOnOpenMPTileDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '#pragma omp unroll' after parsing of its clauses + /// and the associated statement. + StmtResult ActOnOpenMPUnrollDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp for' after parsing + /// of the associated statement. + StmtResult + ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, + SourceLocation StartLoc, SourceLocation EndLoc, + VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp for simd' after parsing + /// of the associated statement. 
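+  ///
+  /// For example (illustrative):
+  ///
+  ///  #pragma omp parallel
+  ///  #pragma omp for simd
+  ///  for (int i = 0; i < N; ++i)
+  ///    a[i] += b[i];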
+  StmtResult
+  ActOnOpenMPForSimdDirective(ArrayRef<OMPClause *> Clauses, Stmt *AStmt,
+                              SourceLocation StartLoc, SourceLocation EndLoc,
+                              VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp sections' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPSectionsDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp section' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPSectionDirective(Stmt *AStmt, SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp scope' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPScopeDirective(ArrayRef<OMPClause *> Clauses,
+                                       Stmt *AStmt, SourceLocation StartLoc,
+                                       SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp single' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPSingleDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp master' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPMasterDirective(Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp critical' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPCriticalDirective(const DeclarationNameInfo &DirName,
+                                          ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp parallel for' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel for simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp parallel master' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMasterDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp parallel masked' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelMaskedDirective(ArrayRef<OMPClause *> Clauses,
+                                                Stmt *AStmt,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp parallel sections' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPParallelSectionsDirective(ArrayRef<OMPClause *> Clauses,
+                                                  Stmt *AStmt,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp task' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPTaskDirective(ArrayRef<OMPClause *> Clauses,
+                                      Stmt *AStmt, SourceLocation StartLoc,
+                                      SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp taskyield'.
+  StmtResult ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc,
+                                           SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp error'.
+  /// The error directive is allowed in both declarative and executable
+  /// contexts; InExContext identifies which context a given directive was
+  /// called from.
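+  ///
+  /// For example (illustrative):
+  ///
+  ///  #pragma omp error at(execution) severity(warning) message("bad path")
+  ///
+  /// is an executable form of the directive, while the same pragma at file
+  /// scope is a declarative one.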
+ StmtResult ActOnOpenMPErrorDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc, + bool InExContext = true); + /// Called on well-formed '\#pragma omp barrier'. + StmtResult ActOnOpenMPBarrierDirective(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp taskwait'. + StmtResult ActOnOpenMPTaskwaitDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp taskgroup'. + StmtResult ActOnOpenMPTaskgroupDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp flush'. + StmtResult ActOnOpenMPFlushDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp depobj'. + StmtResult ActOnOpenMPDepobjDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp scan'. + StmtResult ActOnOpenMPScanDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp ordered' after parsing of the + /// associated statement. + StmtResult ActOnOpenMPOrderedDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp atomic' after parsing of the + /// associated statement. + StmtResult ActOnOpenMPAtomicDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp target' after parsing of the + /// associated statement. + StmtResult ActOnOpenMPTargetDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp target data' after parsing of + /// the associated statement. + StmtResult ActOnOpenMPTargetDataDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp target enter data' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc, + Stmt *AStmt); + /// Called on well-formed '\#pragma omp target exit data' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc, + Stmt *AStmt); + /// Called on well-formed '\#pragma omp target parallel' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp target parallel for' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPTargetParallelForDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp teams' after parsing of the + /// associated statement. + StmtResult ActOnOpenMPTeamsDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp teams loop' after parsing of the + /// associated statement. 
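+  ///
+  /// For example (illustrative):
+  ///
+  ///  #pragma omp teams loop
+  ///  for (int i = 0; i < N; ++i)
+  ///    work(i);
+  ///
+  /// A standalone 'loop' construct with bind(teams) is mapped to
+  /// 'distribute' in the same way during semantic analysis.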
+ StmtResult ActOnOpenMPTeamsGenericLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp target teams loop' after parsing of + /// the associated statement. + StmtResult ActOnOpenMPTargetTeamsGenericLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp parallel loop' after parsing of the + /// associated statement. + StmtResult ActOnOpenMPParallelGenericLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp target parallel loop' after parsing + /// of the associated statement. + StmtResult ActOnOpenMPTargetParallelGenericLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp cancellation point'. + StmtResult + ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, + SourceLocation EndLoc, + OpenMPDirectiveKind CancelRegion); + /// Called on well-formed '\#pragma omp cancel'. + StmtResult ActOnOpenMPCancelDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc, + OpenMPDirectiveKind CancelRegion); + /// Called on well-formed '\#pragma omp taskloop' after parsing of the + /// associated statement. + StmtResult + ActOnOpenMPTaskLoopDirective(ArrayRef Clauses, Stmt *AStmt, + SourceLocation StartLoc, SourceLocation EndLoc, + VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp taskloop simd' after parsing of + /// the associated statement. + StmtResult ActOnOpenMPTaskLoopSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp master taskloop' after parsing of the + /// associated statement. + StmtResult ActOnOpenMPMasterTaskLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp master taskloop simd' after parsing of + /// the associated statement. + StmtResult ActOnOpenMPMasterTaskLoopSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp parallel master taskloop' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPParallelMasterTaskLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp parallel master taskloop simd' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPParallelMasterTaskLoopSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp masked taskloop' after parsing of the + /// associated statement. 
+ StmtResult ActOnOpenMPMaskedTaskLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp masked taskloop simd' after parsing of + /// the associated statement. + StmtResult ActOnOpenMPMaskedTaskLoopSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp parallel masked taskloop' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPParallelMaskedTaskLoopDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp parallel masked taskloop simd' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPParallelMaskedTaskLoopSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp distribute' after parsing + /// of the associated statement. + StmtResult + ActOnOpenMPDistributeDirective(ArrayRef Clauses, Stmt *AStmt, + SourceLocation StartLoc, SourceLocation EndLoc, + VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp target update'. + StmtResult ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc, + Stmt *AStmt); + /// Called on well-formed '\#pragma omp distribute parallel for' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPDistributeParallelForDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp distribute parallel for simd' + /// after parsing of the associated statement. + StmtResult ActOnOpenMPDistributeParallelForSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp distribute simd' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPDistributeSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp target parallel for simd' after + /// parsing of the associated statement. + StmtResult ActOnOpenMPTargetParallelForSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp target simd' after parsing of + /// the associated statement. + StmtResult + ActOnOpenMPTargetSimdDirective(ArrayRef Clauses, Stmt *AStmt, + SourceLocation StartLoc, SourceLocation EndLoc, + VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp teams distribute' after parsing of + /// the associated statement. + StmtResult ActOnOpenMPTeamsDistributeDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA); + /// Called on well-formed '\#pragma omp teams distribute simd' after parsing + /// of the associated statement. 
+  StmtResult ActOnOpenMPTeamsDistributeSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute parallel for simd'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp teams distribute parallel for'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPTeamsDistributeParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDirective(ArrayRef<OMPClause *> Clauses,
+                                             Stmt *AStmt,
+                                             SourceLocation StartLoc,
+                                             SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp target teams distribute' after parsing
+  /// of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams distribute parallel for'
+  /// after parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeParallelForDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams distribute parallel for
+  /// simd' after parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp target teams distribute simd' after
+  /// parsing of the associated statement.
+  StmtResult ActOnOpenMPTargetTeamsDistributeSimdDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+  /// Called on well-formed '\#pragma omp interop'.
+  StmtResult ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses,
+                                         SourceLocation StartLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp dispatch' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPDispatchDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc);
+  /// Called on well-formed '\#pragma omp masked' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPMaskedDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc);
+
+  /// Called on well-formed '\#pragma omp loop' after parsing of the
+  /// associated statement.
+  StmtResult ActOnOpenMPGenericLoopDirective(
+      ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
+      SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
+
+  /// Checks correctness of linear modifiers.
+  bool CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind,
+                                 SourceLocation LinLoc);
+  /// Checks that the specified declaration matches requirements for the linear
+  /// decls.
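+  ///
+  /// For example (illustrative), in
+  ///
+  ///  #pragma omp simd linear(j : 2)
+  ///
+  /// the list item j must be of integral or pointer type, which is among the
+  /// requirements checked here.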
+  bool CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc,
+                             OpenMPLinearClauseKind LinKind, QualType Type,
+                             bool IsDeclareSimd = false);
+
+  /// Called on well-formed '\#pragma omp declare simd' after parsing of
+  /// the associated method/function.
+  DeclGroupPtrTy ActOnOpenMPDeclareSimdDirective(
+      DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS,
+      Expr *Simdlen, ArrayRef<Expr *> Uniforms, ArrayRef<Expr *> Aligneds,
+      ArrayRef<Expr *> Alignments, ArrayRef<Expr *> Linears,
+      ArrayRef<unsigned> LinModifiers, ArrayRef<Expr *> Steps, SourceRange SR);
+
+  /// Checks '\#pragma omp declare variant' variant function and original
+  /// functions after parsing of the associated method/function.
+  /// \param DG Function declaration to which the declare variant directive is
+  /// applied.
+  /// \param VariantRef Expression that references the variant function, which
+  /// must be used instead of the original one, specified in \p DG.
+  /// \param TI The trait info object representing the match clause.
+  /// \param NumAppendArgs The number of omp_interop_t arguments to account for
+  /// in checking.
+  /// \returns std::nullopt if the function/variant function are not compatible
+  /// with the pragma; otherwise, a pair of the original function and variant
+  /// ref expression.
+  std::optional<std::pair<FunctionDecl *, Expr *>>
+  checkOpenMPDeclareVariantFunction(DeclGroupPtrTy DG, Expr *VariantRef,
+                                    OMPTraitInfo &TI, unsigned NumAppendArgs,
+                                    SourceRange SR);
+
+  /// Called on well-formed '\#pragma omp declare variant' after parsing of
+  /// the associated method/function.
+  /// \param FD Function declaration to which the declare variant directive is
+  /// applied.
+  /// \param VariantRef Expression that references the variant function, which
+  /// must be used instead of the original one, specified in \p DG.
+  /// \param TI The context traits associated with the function variant.
+  /// \param AdjustArgsNothing The list of 'nothing' arguments.
+  /// \param AdjustArgsNeedDevicePtr The list of 'need_device_ptr' arguments.
+  /// \param AppendArgs The list of 'append_args' arguments.
+  /// \param AdjustArgsLoc The Location of an 'adjust_args' clause.
+  /// \param AppendArgsLoc The Location of an 'append_args' clause.
+  /// \param SR The SourceRange of the 'declare variant' directive.
+  void ActOnOpenMPDeclareVariantDirective(
+      FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI,
+      ArrayRef<Expr *> AdjustArgsNothing,
+      ArrayRef<Expr *> AdjustArgsNeedDevicePtr,
+      ArrayRef<OMPInteropInfo> AppendArgs, SourceLocation AdjustArgsLoc,
+      SourceLocation AppendArgsLoc, SourceRange SR);
+
+  OMPClause *ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr,
+                                         SourceLocation StartLoc,
+                                         SourceLocation LParenLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed 'allocator' clause.
+  OMPClause *ActOnOpenMPAllocatorClause(Expr *Allocator,
+                                        SourceLocation StartLoc,
+                                        SourceLocation LParenLoc,
+                                        SourceLocation EndLoc);
+  /// Called on well-formed 'if' clause.
+  OMPClause *ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier,
+                                 Expr *Condition, SourceLocation StartLoc,
+                                 SourceLocation LParenLoc,
+                                 SourceLocation NameModifierLoc,
+                                 SourceLocation ColonLoc,
+                                 SourceLocation EndLoc);
+  /// Called on well-formed 'final' clause.
+  OMPClause *ActOnOpenMPFinalClause(Expr *Condition, SourceLocation StartLoc,
+                                    SourceLocation LParenLoc,
+                                    SourceLocation EndLoc);
+  /// Called on well-formed 'num_threads' clause.
+  OMPClause *ActOnOpenMPNumThreadsClause(Expr *NumThreads,
+                                         SourceLocation StartLoc,
+                                         SourceLocation LParenLoc,
+                                         SourceLocation EndLoc);
+  /// Called on well-formed 'align' clause.
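+  ///
+  /// For example (illustrative), the 64-byte alignment request in
+  ///
+  ///  #pragma omp allocate(arr) align(64)
+  ///
+  /// is processed by this callback.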
+ OMPClause *ActOnOpenMPAlignClause(Expr *Alignment, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'safelen' clause.
+ OMPClause *ActOnOpenMPSafelenClause(Expr *Length, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'simdlen' clause.
+ OMPClause *ActOnOpenMPSimdlenClause(Expr *Length, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'sizes' clause.
+ OMPClause *ActOnOpenMPSizesClause(ArrayRef SizeExprs,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'full' clause.
+ OMPClause *ActOnOpenMPFullClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'partial' clause.
+ OMPClause *ActOnOpenMPPartialClause(Expr *FactorExpr, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'collapse' clause.
+ OMPClause *ActOnOpenMPCollapseClause(Expr *NumForLoops,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'ordered' clause.
+ OMPClause *
+ ActOnOpenMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc,
+ SourceLocation LParenLoc = SourceLocation(),
+ Expr *NumForLoops = nullptr);
+ /// Called on well-formed 'grainsize' clause.
+ OMPClause *ActOnOpenMPGrainsizeClause(OpenMPGrainsizeClauseModifier Modifier,
+ Expr *Size, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation ModifierLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'num_tasks' clause.
+ OMPClause *ActOnOpenMPNumTasksClause(OpenMPNumTasksClauseModifier Modifier,
+ Expr *NumTasks, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation ModifierLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'hint' clause.
+ OMPClause *ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'detach' clause.
+ OMPClause *ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+
+ OMPClause *ActOnOpenMPSimpleClause(OpenMPClauseKind Kind, unsigned Argument,
+ SourceLocation ArgumentLoc,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'when' clause.
+ OMPClause *ActOnOpenMPWhenClause(OMPTraitInfo &TI, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'default' clause.
+ OMPClause *ActOnOpenMPDefaultClause(llvm::omp::DefaultKind Kind,
+ SourceLocation KindLoc,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'proc_bind' clause.
+ OMPClause *ActOnOpenMPProcBindClause(llvm::omp::ProcBindKind Kind,
+ SourceLocation KindLoc,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'order' clause.
+ OMPClause *ActOnOpenMPOrderClause(OpenMPOrderClauseModifier Modifier,
+ OpenMPOrderClauseKind Kind,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation MLoc, SourceLocation KindLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'update' clause.
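+ /// (Editor's illustration, not part of this patch: this overload handles
+ /// the 'update' clause that carries a dependence type, as in
+ /// `#pragma omp depobj(o) update(inout)`; a second, argument-less overload
+ /// appears further below.)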
+ OMPClause *ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + OMPClause *ActOnOpenMPSingleExprWithArgClause( + OpenMPClauseKind Kind, ArrayRef Arguments, Expr *Expr, + SourceLocation StartLoc, SourceLocation LParenLoc, + ArrayRef ArgumentsLoc, SourceLocation DelimLoc, + SourceLocation EndLoc); + /// Called on well-formed 'schedule' clause. + OMPClause *ActOnOpenMPScheduleClause( + OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2, + OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, + SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc); + + OMPClause *ActOnOpenMPClause(OpenMPClauseKind Kind, SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'nowait' clause. + OMPClause *ActOnOpenMPNowaitClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'untied' clause. + OMPClause *ActOnOpenMPUntiedClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'mergeable' clause. + OMPClause *ActOnOpenMPMergeableClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'read' clause. + OMPClause *ActOnOpenMPReadClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'write' clause. + OMPClause *ActOnOpenMPWriteClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'update' clause. + OMPClause *ActOnOpenMPUpdateClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'capture' clause. + OMPClause *ActOnOpenMPCaptureClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'compare' clause. + OMPClause *ActOnOpenMPCompareClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'fail' clause. + OMPClause *ActOnOpenMPFailClause(SourceLocation StartLoc, + SourceLocation EndLoc); + OMPClause *ActOnOpenMPFailClause(OpenMPClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on well-formed 'seq_cst' clause. + OMPClause *ActOnOpenMPSeqCstClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'acq_rel' clause. + OMPClause *ActOnOpenMPAcqRelClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'acquire' clause. + OMPClause *ActOnOpenMPAcquireClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'release' clause. + OMPClause *ActOnOpenMPReleaseClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'relaxed' clause. + OMPClause *ActOnOpenMPRelaxedClause(SourceLocation StartLoc, + SourceLocation EndLoc); + /// Called on well-formed 'weak' clause. + OMPClause *ActOnOpenMPWeakClause(SourceLocation StartLoc, + SourceLocation EndLoc); + + /// Called on well-formed 'init' clause. + OMPClause * + ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation VarLoc, SourceLocation EndLoc); + + /// Called on well-formed 'use' clause. + OMPClause *ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, SourceLocation EndLoc); + + /// Called on well-formed 'destroy' clause. 
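+ /// (Editor's illustration, not part of this patch: e.g.
+ /// `#pragma omp interop destroy(obj)`, where `obj` is passed as
+ /// \p InteropVar.)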
+ OMPClause *ActOnOpenMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation VarLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'novariants' clause.
+ OMPClause *ActOnOpenMPNovariantsClause(Expr *Condition,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'nocontext' clause.
+ OMPClause *ActOnOpenMPNocontextClause(Expr *Condition,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'filter' clause.
+ OMPClause *ActOnOpenMPFilterClause(Expr *ThreadID, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'threads' clause.
+ OMPClause *ActOnOpenMPThreadsClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'simd' clause.
+ OMPClause *ActOnOpenMPSIMDClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'nogroup' clause.
+ OMPClause *ActOnOpenMPNogroupClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+ /// Called on well-formed 'unified_address' clause.
+ OMPClause *ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
+ /// Called on well-formed 'unified_shared_memory' clause.
+ OMPClause *ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
+ /// Called on well-formed 'reverse_offload' clause.
+ OMPClause *ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
+ /// Called on well-formed 'dynamic_allocators' clause.
+ OMPClause *ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc,
+ SourceLocation EndLoc);
+
+ /// Called on well-formed 'atomic_default_mem_order' clause.
+ OMPClause *ActOnOpenMPAtomicDefaultMemOrderClause(
+ OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindLoc,
+ SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc);
+
+ /// Called on well-formed 'at' clause.
+ OMPClause *ActOnOpenMPAtClause(OpenMPAtClauseKind Kind,
+ SourceLocation KindLoc,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+
+ /// Called on well-formed 'severity' clause.
+ OMPClause *ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind,
+ SourceLocation KindLoc,
+ SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+
+ /// Called on well-formed 'message' clause; \p MS is the message string.
+ OMPClause *ActOnOpenMPMessageClause(Expr *MS, SourceLocation StartLoc,
+ SourceLocation LParenLoc,
+ SourceLocation EndLoc);
+
+ /// Data used for processing a list of variables in OpenMP clauses.
+ struct OpenMPVarListDataTy final {
+ Expr *DepModOrTailExpr = nullptr;
+ Expr *IteratorExpr = nullptr;
+ SourceLocation ColonLoc;
+ SourceLocation RLoc;
+ CXXScopeSpec ReductionOrMapperIdScopeSpec;
+ DeclarationNameInfo ReductionOrMapperId;
+ int ExtraModifier = -1; ///< Additional modifier for linear, map, depend or
+ ///< lastprivate clause.
+ SmallVector + MapTypeModifiers; + SmallVector + MapTypeModifiersLoc; + SmallVector + MotionModifiers; + SmallVector MotionModifiersLoc; + bool IsMapTypeImplicit = false; + SourceLocation ExtraModifierLoc; + SourceLocation OmpAllMemoryLoc; + SourceLocation + StepModifierLoc; /// 'step' modifier location for linear clause + }; + + OMPClause *ActOnOpenMPVarListClause(OpenMPClauseKind Kind, + ArrayRef Vars, + const OMPVarListLocTy &Locs, + OpenMPVarListDataTy &Data); + /// Called on well-formed 'inclusive' clause. + OMPClause *ActOnOpenMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'exclusive' clause. + OMPClause *ActOnOpenMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'allocate' clause. + OMPClause * + ActOnOpenMPAllocateClause(Expr *Allocator, ArrayRef VarList, + SourceLocation StartLoc, SourceLocation ColonLoc, + SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'private' clause. + OMPClause *ActOnOpenMPPrivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'firstprivate' clause. + OMPClause *ActOnOpenMPFirstprivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'lastprivate' clause. + OMPClause *ActOnOpenMPLastprivateClause( + ArrayRef VarList, OpenMPLastprivateModifier LPKind, + SourceLocation LPKindLoc, SourceLocation ColonLoc, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc); + /// Called on well-formed 'shared' clause. + OMPClause *ActOnOpenMPSharedClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'reduction' clause. + OMPClause *ActOnOpenMPReductionClause( + ArrayRef VarList, OpenMPReductionClauseModifier Modifier, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation ColonLoc, + SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions = std::nullopt); + /// Called on well-formed 'task_reduction' clause. + OMPClause *ActOnOpenMPTaskReductionClause( + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, + CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions = std::nullopt); + /// Called on well-formed 'in_reduction' clause. + OMPClause *ActOnOpenMPInReductionClause( + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, + CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions = std::nullopt); + /// Called on well-formed 'linear' clause. + OMPClause *ActOnOpenMPLinearClause( + ArrayRef VarList, Expr *Step, SourceLocation StartLoc, + SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind, + SourceLocation LinLoc, SourceLocation ColonLoc, + SourceLocation StepModifierLoc, SourceLocation EndLoc); + /// Called on well-formed 'aligned' clause. 
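+ /// (Editor's illustration, not part of this patch: e.g.
+ /// `#pragma omp simd aligned(p, q : 32)`, where `p, q` form the variable
+ /// list and `32` is the alignment expression.)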
+ OMPClause *ActOnOpenMPAlignedClause(ArrayRef VarList, Expr *Alignment, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation ColonLoc, + SourceLocation EndLoc); + /// Called on well-formed 'copyin' clause. + OMPClause *ActOnOpenMPCopyinClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'copyprivate' clause. + OMPClause *ActOnOpenMPCopyprivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'flush' pseudo clause. + OMPClause *ActOnOpenMPFlushClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'depobj' pseudo clause. + OMPClause *ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'depend' clause. + OMPClause *ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, + Expr *DepModifier, + ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'device' clause. + OMPClause *ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, + Expr *Device, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation ModifierLoc, + SourceLocation EndLoc); + /// Called on well-formed 'map' clause. + OMPClause *ActOnOpenMPMapClause( + Expr *IteratorModifier, ArrayRef MapTypeModifiers, + ArrayRef MapTypeModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, + OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, + SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, + const OMPVarListLocTy &Locs, bool NoDiagnose = false, + ArrayRef UnresolvedMappers = std::nullopt); + /// Called on well-formed 'num_teams' clause. + OMPClause *ActOnOpenMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'thread_limit' clause. + OMPClause *ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'priority' clause. + OMPClause *ActOnOpenMPPriorityClause(Expr *Priority, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'dist_schedule' clause. + OMPClause *ActOnOpenMPDistScheduleClause( + OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, + SourceLocation CommaLoc, SourceLocation EndLoc); + /// Called on well-formed 'defaultmap' clause. + OMPClause *ActOnOpenMPDefaultmapClause( + OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, + SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, + SourceLocation KindLoc, SourceLocation EndLoc); + /// Called on well-formed 'to' clause. + OMPClause * + ActOnOpenMPToClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers = std::nullopt); + /// Called on well-formed 'from' clause. 
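+ /// (Editor's illustration, not part of this patch: 'to' and 'from' appear
+ /// on the target update directive, e.g.
+ /// `#pragma omp target update from(a[0:n])`, optionally with motion
+ /// modifiers such as `present:`.)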
+ OMPClause * + ActOnOpenMPFromClause(ArrayRef MotionModifiers, + ArrayRef MotionModifiersLoc, + CXXScopeSpec &MapperIdScopeSpec, + DeclarationNameInfo &MapperId, SourceLocation ColonLoc, + ArrayRef VarList, const OMPVarListLocTy &Locs, + ArrayRef UnresolvedMappers = std::nullopt); + /// Called on well-formed 'use_device_ptr' clause. + OMPClause *ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'use_device_addr' clause. + OMPClause *ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'is_device_ptr' clause. + OMPClause *ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'has_device_addr' clause. + OMPClause *ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs); + /// Called on well-formed 'nontemporal' clause. + OMPClause *ActOnOpenMPNontemporalClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Data for list of allocators. + struct UsesAllocatorsData { + /// Allocator. + Expr *Allocator = nullptr; + /// Allocator traits. + Expr *AllocatorTraits = nullptr; + /// Locations of '(' and ')' symbols. + SourceLocation LParenLoc, RParenLoc; + }; + /// Called on well-formed 'uses_allocators' clause. + OMPClause *ActOnOpenMPUsesAllocatorClause(SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + ArrayRef Data); + /// Called on well-formed 'affinity' clause. + OMPClause *ActOnOpenMPAffinityClause(SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation ColonLoc, + SourceLocation EndLoc, Expr *Modifier, + ArrayRef Locators); + /// Called on a well-formed 'bind' clause. + OMPClause *ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on a well-formed 'ompx_dyn_cgroup_mem' clause. + OMPClause *ActOnOpenMPXDynCGroupMemClause(Expr *Size, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on well-formed 'doacross' clause. + OMPClause * + ActOnOpenMPDoacrossClause(OpenMPDoacrossClauseModifier DepType, + SourceLocation DepLoc, SourceLocation ColonLoc, + ArrayRef VarList, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation EndLoc); + + /// Called on a well-formed 'ompx_attribute' clause. + OMPClause *ActOnOpenMPXAttributeClause(ArrayRef Attrs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + + /// Called on a well-formed 'ompx_bare' clause. + OMPClause *ActOnOpenMPXBareClause(SourceLocation StartLoc, + SourceLocation EndLoc); + + ExprResult ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc, + Expr *LowerBound, + SourceLocation ColonLocFirst, + SourceLocation ColonLocSecond, + Expr *Length, Expr *Stride, + SourceLocation RBLoc); + ExprResult ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc, + SourceLocation RParenLoc, + ArrayRef Dims, + ArrayRef Brackets); + + /// Data structure for iterator expression. 
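+ /// (Editor's illustration, not part of this patch: an iterator expression
+ /// such as `iterator(int it = 0 : n : 2)` inside a depend clause declares
+ /// `it` with a begin : end : step range; the fields below record the
+ /// identifier, its type, the range, and the locations of '=' and the two
+ /// colons.)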
+ struct OMPIteratorData {
+ IdentifierInfo *DeclIdent = nullptr;
+ SourceLocation DeclIdentLoc;
+ ParsedType Type;
+ OMPIteratorExpr::IteratorRange Range;
+ SourceLocation AssignLoc;
+ SourceLocation ColonLoc;
+ SourceLocation SecColonLoc;
+ };
+
+ ExprResult ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc,
+ SourceLocation LLoc, SourceLocation RLoc,
+ ArrayRef Data);
+
+private:
+ void *VarDataSharingAttributesStack;
+
+ /// Stack of nested '#pragma omp declare target' directives.
+ SmallVector DeclareTargetNesting;
+
+ /// Initialization of data-sharing attributes stack.
+ void InitDataSharingAttributesStack();
+ void DestroyDataSharingAttributesStack();
+
+ /// Returns OpenMP nesting level for current directive.
+ unsigned getOpenMPNestingLevel() const;
+
+ /// Adjusts the function scopes index for the target-based regions.
+ void adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex,
+ unsigned Level) const;
+
+ /// Returns the number of scopes associated with the construct on the given
+ /// OpenMP level.
+ int getNumberOfConstructScopes(unsigned Level) const;
+
+ /// Push new OpenMP function region for non-capturing function.
+ void pushOpenMPFunctionRegion();
+
+ /// Pop OpenMP function region for non-capturing function.
+ void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI);
+
+ /// Analyzes and checks a loop nest for use by a loop transformation.
+ ///
+ /// \param Kind The loop transformation directive kind.
+ /// \param NumLoops How many nested loops the directive is expecting.
+ /// \param AStmt Associated statement of the transformation directive.
+ /// \param LoopHelpers [out] The loop analysis result.
+ /// \param Body [out] The body code nested in \p NumLoops loops.
+ /// \param OriginalInits [out] Collection of statements and declarations that
+ /// must have been executed/declared before entering the
+ /// loop.
+ ///
+ /// \return Whether there was any error.
+ bool checkTransformableLoopNest(
+ OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops,
+ SmallVectorImpl &LoopHelpers,
+ Stmt *&Body,
+ SmallVectorImpl, 0>>
+ &OriginalInits);
+
+ /// Helper to keep information about the current `omp begin/end declare
+ /// variant` nesting.
+ struct OMPDeclareVariantScope {
+ /// The associated OpenMP context selector.
+ OMPTraitInfo *TI;
+
+ /// The associated OpenMP context selector mangling.
+ std::string NameSuffix;
+
+ OMPDeclareVariantScope(OMPTraitInfo &TI);
+ };
+
+ /// Return the OMPTraitInfo for the surrounding scope, if any.
+ OMPTraitInfo *getOMPTraitInfoForSurroundingScope() {
+ return OMPDeclareVariantScopes.empty() ? nullptr
+ : OMPDeclareVariantScopes.back().TI;
+ }
+
+ /// The current `omp begin/end declare variant` scopes.
+ SmallVector OMPDeclareVariantScopes;
+
+ /// The current `omp begin/end assumes` scopes.
+ SmallVector OMPAssumeScoped;
+
+ /// All `omp assumes` we encountered so far.
+ SmallVector OMPAssumeGlobal;
+
+ /// OMPD_loop is mapped to OMPD_for, OMPD_distribute or OMPD_simd depending
+ /// on the parameter of the bind clause. In the methods for the
+ /// mapped directives, check the parameters of the lastprivate clause.
+ bool checkLastPrivateForMappedDirectives(ArrayRef Clauses);
+ /// Depending on the bind clause of OMPD_loop, map the directive to new
+ /// directives.
+ /// 1) loop bind(parallel) --> OMPD_for + /// 2) loop bind(teams) --> OMPD_distribute + /// 3) loop bind(thread) --> OMPD_simd + /// This is being handled in Sema instead of Codegen because of the need for + /// rigorous semantic checking in the new mapped directives. + bool mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, + ArrayRef Clauses, + OpenMPBindClauseKind &BindKind, + OpenMPDirectiveKind &Kind, + OpenMPDirectiveKind &PrevMappedDirective, + SourceLocation StartLoc, SourceLocation EndLoc, + const DeclarationNameInfo &DirName, + OpenMPDirectiveKind CancelRegion); +}; + +} // namespace clang + +#endif // LLVM_CLANG_SEMA_SEMAOPENMP_H diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index e3fde887f99cb..43ee06c524b3a 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -2457,6 +2457,12 @@ class BitsUnpacker { uint32_t Value; uint32_t CurrentBitsIndex = ~0; }; + +inline bool shouldSkipCheckingODR(const Decl *D) { + return D->getASTContext().getLangOpts().SkipODRCheckInGMF && + D->isFromExplicitGlobalModule(); +} + } // namespace clang #endif // LLVM_CLANG_SERIALIZATION_ASTREADER_H diff --git a/clang/include/clang/Serialization/ModuleFileExtension.h b/clang/include/clang/Serialization/ModuleFileExtension.h index d7d456c8b5db8..50ce401516275 100644 --- a/clang/include/clang/Serialization/ModuleFileExtension.h +++ b/clang/include/clang/Serialization/ModuleFileExtension.h @@ -9,7 +9,6 @@ #ifndef LLVM_CLANG_SERIALIZATION_MODULEFILEEXTENSION_H #define LLVM_CLANG_SERIALIZATION_MODULEFILEEXTENSION_H -#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/Support/ExtensibleRTTI.h" #include "llvm/Support/HashBuilder.h" #include "llvm/Support/MD5.h" diff --git a/clang/include/clang/Serialization/PCHContainerOperations.h b/clang/include/clang/Serialization/PCHContainerOperations.h index ddfddf2dafadf..c9a7e334ce6eb 100644 --- a/clang/include/clang/Serialization/PCHContainerOperations.h +++ b/clang/include/clang/Serialization/PCHContainerOperations.h @@ -12,7 +12,7 @@ #include "clang/Basic/Module.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" -#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/MemoryBufferRef.h" #include namespace llvm { diff --git a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h index 081899cc2c850..da51292296a90 100644 --- a/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h +++ b/clang/include/clang/Tooling/DependencyScanning/ModuleDepCollector.h @@ -308,6 +308,11 @@ class ModuleDepCollector final : public DependencyCollector { ModuleDeps &Deps); }; +/// Resets codegen options that don't affect modules/PCH. 
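+/// (Editor's note; an assumption, not part of this patch: the intent is
+/// that options which do not change the produced AST, for example coverage
+/// or debug-info output paths, are cleared so they do not perturb the
+/// module context hash during dependency scanning.)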
+void resetBenignCodeGenOptions(frontend::ActionKind ProgramAction, + const LangOptions &LangOpts, + CodeGenOptions &CGOpts); + } // end namespace dependencies } // end namespace tooling } // end namespace clang diff --git a/clang/lib/APINotes/APINotesReader.cpp b/clang/lib/APINotes/APINotesReader.cpp index fbbe9c32ce125..dfc3beb6fa13e 100644 --- a/clang/lib/APINotes/APINotesReader.cpp +++ b/clang/lib/APINotes/APINotesReader.cpp @@ -30,23 +30,20 @@ namespace { llvm::VersionTuple ReadVersionTuple(const uint8_t *&Data) { uint8_t NumVersions = (*Data++) & 0x03; - unsigned Major = - endian::readNext(Data); + unsigned Major = endian::readNext(Data); if (NumVersions == 0) return llvm::VersionTuple(Major); - unsigned Minor = - endian::readNext(Data); + unsigned Minor = endian::readNext(Data); if (NumVersions == 1) return llvm::VersionTuple(Major, Minor); unsigned Subminor = - endian::readNext(Data); + endian::readNext(Data); if (NumVersions == 2) return llvm::VersionTuple(Major, Minor, Subminor); - unsigned Build = - endian::readNext(Data); + unsigned Build = endian::readNext(Data); return llvm::VersionTuple(Major, Minor, Subminor, Build); } @@ -71,16 +68,16 @@ class VersionedTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { unsigned NumElements = - endian::readNext(Data); + endian::readNext(Data); data_type Result; Result.reserve(NumElements); for (unsigned i = 0; i != NumElements; ++i) { @@ -105,14 +102,14 @@ void ReadCommonEntityInfo(const uint8_t *&Data, CommonEntityInfo &Info) { Info.setSwiftPrivate(static_cast((UnavailableBits >> 3) & 0x01)); unsigned MsgLength = - endian::readNext(Data); + endian::readNext(Data); Info.UnavailableMsg = std::string(reinterpret_cast(Data), reinterpret_cast(Data) + MsgLength); Data += MsgLength; unsigned SwiftNameLength = - endian::readNext(Data); + endian::readNext(Data); Info.SwiftName = std::string(reinterpret_cast(Data), reinterpret_cast(Data) + SwiftNameLength); @@ -124,7 +121,7 @@ void ReadCommonTypeInfo(const uint8_t *&Data, CommonTypeInfo &Info) { ReadCommonEntityInfo(Data, Info); unsigned SwiftBridgeLength = - endian::readNext(Data); + endian::readNext(Data); if (SwiftBridgeLength > 0) { Info.setSwiftBridge(std::string(reinterpret_cast(Data), SwiftBridgeLength - 1)); @@ -132,7 +129,7 @@ void ReadCommonTypeInfo(const uint8_t *&Data, CommonTypeInfo &Info) { } unsigned ErrorDomainLength = - endian::readNext(Data); + endian::readNext(Data); if (ErrorDomainLength > 0) { Info.setNSErrorDomain(std::optional(std::string( reinterpret_cast(Data), ErrorDomainLength - 1))); @@ -163,9 +160,9 @@ class IdentifierTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } @@ -175,8 +172,7 @@ class IdentifierTableInfo { static data_type ReadData(internal_key_type key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -203,26 +199,24 @@ class ObjCContextIDTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - 
endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { auto ParentCtxID = - endian::readNext(Data); + endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {ParentCtxID, ContextKind, NameID}; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -232,8 +226,7 @@ class ObjCContextInfoTableInfo ObjCContextInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } hash_value_type ComputeHash(internal_key_type Key) { @@ -273,8 +266,7 @@ void ReadVariableInfo(const uint8_t *&Data, VariableInfo &Info) { } ++Data; - auto TypeLen = - endian::readNext(Data); + auto TypeLen = endian::readNext(Data); Info.setType(std::string(Data, Data + TypeLen)); Data += TypeLen; } @@ -286,12 +278,9 @@ class ObjCPropertyTableInfo ObjCPropertyInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto ClassID = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); - char IsInstance = - endian::readNext(Data); + auto ClassID = endian::readNext(Data); + auto NameID = endian::readNext(Data); + char IsInstance = endian::readNext(Data); return {ClassID, NameID, IsInstance}; } @@ -314,8 +303,7 @@ class ObjCPropertyTableInfo void ReadParamInfo(const uint8_t *&Data, ParamInfo &Info) { ReadVariableInfo(Data, Info); - uint8_t Payload = - endian::readNext(Data); + uint8_t Payload = endian::readNext(Data); if (auto RawConvention = Payload & 0x7) { auto Convention = static_cast(RawConvention - 1); Info.setRetainCountConvention(Convention); @@ -331,8 +319,7 @@ void ReadParamInfo(const uint8_t *&Data, ParamInfo &Info) { void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { ReadCommonEntityInfo(Data, Info); - uint8_t Payload = - endian::readNext(Data); + uint8_t Payload = endian::readNext(Data); if (auto RawConvention = Payload & 0x7) { auto Convention = static_cast(RawConvention - 1); Info.setRetainCountConvention(Convention); @@ -343,12 +330,12 @@ void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { assert(Payload == 0 && "Bad API notes"); Info.NumAdjustedNullable = - endian::readNext(Data); + endian::readNext(Data); Info.NullabilityPayload = - endian::readNext(Data); + endian::readNext(Data); unsigned NumParams = - endian::readNext(Data); + endian::readNext(Data); while (NumParams > 0) { ParamInfo pi; ReadParamInfo(Data, pi); @@ -357,7 +344,7 @@ void ReadFunctionInfo(const uint8_t *&Data, FunctionInfo &Info) { } unsigned ResultTypeLen = - endian::readNext(Data); + endian::readNext(Data); Info.ResultType = std::string(Data, Data + ResultTypeLen); Data += ResultTypeLen; } @@ -369,12 +356,10 @@ class ObjCMethodTableInfo ObjCMethodInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto ClassID = - endian::readNext(Data); + auto ClassID = endian::readNext(Data); auto SelectorID = - endian::readNext(Data); - auto IsInstance = - endian::readNext(Data); + endian::readNext(Data); + auto IsInstance = endian::readNext(Data); return {ClassID, SelectorID, IsInstance}; } @@ -419,29 +404,26 @@ class ObjCSelectorTableInfo { static std::pair ReadKeyDataLength(const uint8_t *&Data) { unsigned 
KeyLength = - endian::readNext(Data); + endian::readNext(Data); unsigned DataLength = - endian::readNext(Data); + endian::readNext(Data); return {KeyLength, DataLength}; } static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { internal_key_type Key; - Key.NumArgs = - endian::readNext(Data); + Key.NumArgs = endian::readNext(Data); unsigned NumIdents = (Length - sizeof(uint16_t)) / sizeof(uint32_t); for (unsigned i = 0; i != NumIdents; ++i) { Key.Identifiers.push_back( - endian::readNext( - Data)); + endian::readNext(Data)); } return Key; } static data_type ReadData(internal_key_type Key, const uint8_t *Data, unsigned Length) { - return endian::readNext( - Data); + return endian::readNext(Data); } }; @@ -451,12 +433,10 @@ class GlobalVariableTableInfo GlobalVariableInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -478,12 +458,10 @@ class GlobalFunctionTableInfo GlobalFunctionInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); - auto NameID = - endian::readNext(Data); + endian::readNext(Data); + auto NameID = endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -505,8 +483,7 @@ class EnumConstantTableInfo EnumConstantInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto NameID = - endian::readNext(Data); + auto NameID = endian::readNext(Data); return NameID; } @@ -527,13 +504,11 @@ class TagTableInfo : public VersionedTableInfo { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); + endian::readNext(Data); auto NameID = - endian::readNext( - Data); + endian::readNext(Data); return {CtxID, ContextKind, NameID}; } @@ -553,21 +528,21 @@ class TagTableInfo static_cast((Payload & 0x3) - 1); unsigned ImportAsLength = - endian::readNext(Data); + endian::readNext(Data); if (ImportAsLength > 0) { Info.SwiftImportAs = std::string(reinterpret_cast(Data), ImportAsLength - 1); Data += ImportAsLength - 1; } unsigned RetainOpLength = - endian::readNext(Data); + endian::readNext(Data); if (RetainOpLength > 0) { Info.SwiftRetainOp = std::string(reinterpret_cast(Data), RetainOpLength - 1); Data += RetainOpLength - 1; } unsigned ReleaseOpLength = - endian::readNext(Data); + endian::readNext(Data); if (ReleaseOpLength > 0) { Info.SwiftReleaseOp = std::string(reinterpret_cast(Data), ReleaseOpLength - 1); @@ -585,13 +560,11 @@ class TypedefTableInfo TypedefInfo> { public: static internal_key_type ReadKey(const uint8_t *Data, unsigned Length) { - auto CtxID = - endian::readNext(Data); + auto CtxID = endian::readNext(Data); auto ContextKind = - endian::readNext(Data); + endian::readNext(Data); auto nameID = - endian::readNext( - Data); + endian::readNext(Data); return {CtxID, ContextKind, nameID}; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 2b2d5a2663a18..33b6f8611f216 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -4534,7 +4534,7 @@ unsigned FunctionDecl::getODRHash() { } class 
ODRHash Hash; - Hash.AddFunctionDecl(this, /*SkipBody=*/shouldSkipCheckingODR()); + Hash.AddFunctionDecl(this); setHasODRHash(true); ODRHash = Hash.CalculateHash(); return ODRHash; diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index 66a727d9dd0c3..434926324c96c 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1106,11 +1106,6 @@ bool Decl::isFromExplicitGlobalModule() const { return getOwningModule() && getOwningModule()->isExplicitGlobalModule(); } -bool Decl::shouldSkipCheckingODR() const { - return getASTContext().getLangOpts().SkipODRCheckInGMF && - isFromExplicitGlobalModule(); -} - static Decl::Kind getKind(const Decl *D) { return D->getKind(); } static Decl::Kind getKind(const DeclContext *DC) { return DC->getDeclKind(); } diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 01ec31e4077f7..6b4b51aac41e8 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -262,7 +262,7 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { return this->discard(SubExpr); std::optional FromT = classify(SubExpr->getType()); - std::optional ToT = classifyPrim(CE->getType()); + std::optional ToT = classify(CE->getType()); if (!FromT || !ToT) return false; @@ -398,6 +398,35 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { return true; } + case CK_VectorSplat: { + assert(!classify(CE->getType())); + assert(classify(SubExpr->getType())); + assert(CE->getType()->isVectorType()); + + if (DiscardResult) + return this->discard(SubExpr); + + assert(Initializing); // FIXME: Not always correct. + const auto *VT = CE->getType()->getAs(); + PrimType ElemT = classifyPrim(SubExpr); + unsigned ElemOffset = allocateLocalPrimitive( + SubExpr, ElemT, /*IsConst=*/true, /*IsExtended=*/false); + + if (!this->visit(SubExpr)) + return false; + if (!this->emitSetLocal(ElemT, ElemOffset, CE)) + return false; + + for (unsigned I = 0; I != VT->getNumElements(); ++I) { + if (!this->emitGetLocal(ElemT, ElemOffset, CE)) + return false; + if (!this->emitInitElem(ElemT, I, CE)) + return false; + } + + return true; + } + case CK_ToVoid: return discard(SubExpr); @@ -1251,6 +1280,15 @@ bool ByteCodeExprGen::VisitUnaryExprOrTypeTraitExpr( return this->emitConst(Size.getQuantity(), E); } + if (Kind == UETT_VectorElements) { + if (const auto *VT = E->getTypeOfArgument()->getAs()) + return this->emitConst(VT->getNumElements(), E); + + // FIXME: Apparently we need to catch the fact that a sizeless vector type + // has been passed and diagnose that (at run time). + assert(E->getTypeOfArgument()->isSizelessVectorType()); + } + return false; } @@ -1258,10 +1296,30 @@ template bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { // 'Base.Member' const Expr *Base = E->getBase(); + const ValueDecl *Member = E->getMemberDecl(); if (DiscardResult) return this->discard(Base); + // MemberExprs are almost always lvalues, in which case we don't need to + // do the load. But sometimes they aren't. + const auto maybeLoadValue = [&]() -> bool { + if (E->isGLValue()) + return true; + if (std::optional T = classify(E)) + return this->emitLoadPop(*T, E); + return false; + }; + + if (const auto *VD = dyn_cast(Member)) { + // I am almost confident in saying that a var decl must be static + // and therefore registered as a global variable. But this will probably + // turn out to be wrong some time in the future, as always. 
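+ // (Editor's illustration, not part of this patch: the case handled here
+ // is a MemberExpr whose member is a VarDecl, i.e. a static data member:
+ //   struct S { static constexpr int M = 13; };
+ //   constexpr S s;
+ //   static_assert(s.M == 13); // `s.M` resolves to the global for `S::M`.)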
+ if (auto GlobalIndex = P.getGlobal(VD)) + return this->emitGetPtrGlobal(*GlobalIndex, E) && maybeLoadValue(); + return false; + } + if (Initializing) { if (!this->delegate(Base)) return false; @@ -1271,16 +1329,14 @@ bool ByteCodeExprGen::VisitMemberExpr(const MemberExpr *E) { } // Base above gives us a pointer on the stack. - // TODO: Implement non-FieldDecl members. - const ValueDecl *Member = E->getMemberDecl(); if (const auto *FD = dyn_cast(Member)) { const RecordDecl *RD = FD->getParent(); const Record *R = getRecord(RD); const Record::Field *F = R->getField(FD); // Leave a pointer to the field on the stack. if (F->Decl->getType()->isReferenceType()) - return this->emitGetFieldPop(PT_Ptr, F->Offset, E); - return this->emitGetPtrField(F->Offset, E); + return this->emitGetFieldPop(PT_Ptr, F->Offset, E) && maybeLoadValue(); + return this->emitGetPtrField(F->Offset, E) && maybeLoadValue(); } return false; @@ -1615,7 +1671,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( return false; if (!this->emitLoad(*LT, E)) return false; - if (*LT != *LHSComputationT) { + if (LT != LHSComputationT) { if (!this->emitCast(*LT, *LHSComputationT, E)) return false; } @@ -1671,7 +1727,7 @@ bool ByteCodeExprGen::VisitCompoundAssignOperator( } // And now cast from LHSComputationT to ResultT. - if (*ResultT != *LHSComputationT) { + if (ResultT != LHSComputationT) { if (!this->emitCast(*LHSComputationT, *ResultT, E)) return false; } diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index db0d73ce23f7c..7e9dc8631fc0d 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -148,13 +148,20 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, return Ctx.classify(Ty); } - /// Classifies a known primitive type + /// Classifies a known primitive type. PrimType classifyPrim(QualType Ty) const { if (auto T = classify(Ty)) { return *T; } llvm_unreachable("not a primitive type"); } + /// Classifies a known primitive expression. + PrimType classifyPrim(const Expr *E) const { + if (auto T = classify(E)) + return *T; + llvm_unreachable("not a primitive type"); + } + /// Evaluates an expression and places the result on the stack. If the /// expression is of composite type, a local variable will be created /// and a pointer to said variable will be placed on the stack. 
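[Editor's note, an illustrative sketch rather than part of the patch: the CK_VectorSplat lowering and the UETT_VectorElements handling added above make constant expressions like the following evaluable under -fexperimental-new-constant-interpreter, assuming Clang's vector extensions:

  typedef int v4si __attribute__((vector_size(16)));
  // The scalar operand X is implicitly splatted (CK_VectorSplat) to v4si.
  constexpr v4si add_scalar(v4si V, int X) { return V + X; }
  constexpr int first_elem(v4si V) { return V[0]; }
  static_assert(__builtin_vectorelements(v4si) == 4);
  static_assert(first_elem(add_scalar(v4si{1, 2, 3, 4}, 10)) == 11);]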
diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 022b394e58e64..d127f33223e80 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -140,7 +140,7 @@ LLVM_DUMP_METHOD void Program::dump(llvm::raw_ostream &OS) const { const Descriptor *Desc = G->block()->getDescriptor(); Pointer GP = getPtrGlobal(GI); - OS << GI << ": " << (void *)G->block() << " "; + OS << GI << ": " << (const void *)G->block() << " "; { ColorScope SC(OS, true, GP.isInitialized() @@ -264,3 +264,19 @@ LLVM_DUMP_METHOD void Record::dump(llvm::raw_ostream &OS, unsigned Indentation, ++I; } } + +LLVM_DUMP_METHOD void Block::dump(llvm::raw_ostream &OS) const { + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_BLUE, true}); + OS << "Block " << (const void *)this << "\n"; + } + unsigned NPointers = 0; + for (const Pointer *P = Pointers; P; P = P->Next) { + ++NPointers; + } + OS << " Pointers: " << NPointers << "\n"; + OS << " Dead: " << IsDead << "\n"; + OS << " Static: " << IsStatic << "\n"; + OS << " Extern: " << IsExtern << "\n"; + OS << " Initialized: " << IsInitialized << "\n"; +} diff --git a/clang/lib/AST/Interp/FunctionPointer.h b/clang/lib/AST/Interp/FunctionPointer.h index c2ea295b82bdf..fc3d7a4214a72 100644 --- a/clang/lib/AST/Interp/FunctionPointer.h +++ b/clang/lib/AST/Interp/FunctionPointer.h @@ -32,6 +32,7 @@ class FunctionPointer final { const Function *getFunction() const { return Func; } bool isZero() const { return !Func; } + bool isValid() const { return Valid; } bool isWeak() const { if (!Func || !Valid) return false; diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 4182254357eb9..dd0bacd73acb1 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -2236,6 +2236,10 @@ inline bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize, << const_cast(E) << E->getSourceRange(); return false; } + + if (!FuncPtr.isValid()) + return false; + assert(F); // Check argument nullability state. diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 9db82567d2d5d..6d5856fbd4ea1 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -118,6 +118,9 @@ class Block final { IsInitialized = false; } + void dump() const { dump(llvm::errs()); } + void dump(llvm::raw_ostream &OS) const; + protected: friend class Pointer; friend class DeadBlock; diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 984ba4f7f2689..f562f9e1cb19f 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -977,6 +977,117 @@ static bool interp__builtin_complex(InterpState &S, CodePtr OpPC, return true; } +/// __builtin_is_aligned() +/// __builtin_align_up() +/// __builtin_align_down() +/// The first parameter is either an integer or a pointer. +/// The second parameter is the requested alignment as an integer. 
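+/// (Editor's illustration, not part of this patch: in a constant expression,
+/// __builtin_align_up(7, 4) == 8,
+/// __builtin_align_down(7, 4) == 4, and
+/// __builtin_is_aligned(8, 4) is true;
+/// the alignment argument must be a power of two.)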
+static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC,
+ const InterpFrame *Frame,
+ const Function *Func,
+ const CallExpr *Call) {
+ unsigned BuiltinOp = Func->getBuiltinID();
+ unsigned CallSize = callArgSize(S, Call);
+
+ PrimType AlignmentT = *S.Ctx.classify(Call->getArg(1));
+ const APSInt &Alignment = peekToAPSInt(S.Stk, AlignmentT);
+
+ if (Alignment < 0 || !Alignment.isPowerOf2()) {
+ S.FFDiag(Call, diag::note_constexpr_invalid_alignment) << Alignment;
+ return false;
+ }
+ unsigned SrcWidth = S.getCtx().getIntWidth(Call->getArg(0)->getType());
+ APSInt MaxValue(APInt::getOneBitSet(SrcWidth, SrcWidth - 1));
+ if (APSInt::compareValues(Alignment, MaxValue) > 0) {
+ S.FFDiag(Call, diag::note_constexpr_alignment_too_big)
+ << MaxValue << Call->getArg(0)->getType() << Alignment;
+ return false;
+ }
+
+ // The first parameter is either an integer or a pointer (but not a function
+ // pointer).
+ PrimType FirstArgT = *S.Ctx.classify(Call->getArg(0));
+
+ if (isIntegralType(FirstArgT)) {
+ const APSInt &Src = peekToAPSInt(S.Stk, FirstArgT, CallSize);
+ APSInt Align = Alignment.extOrTrunc(Src.getBitWidth());
+ if (BuiltinOp == Builtin::BI__builtin_align_up) {
+ APSInt AlignedVal =
+ APSInt((Src + (Align - 1)) & ~(Align - 1), Src.isUnsigned());
+ pushInteger(S, AlignedVal, Call->getType());
+ } else if (BuiltinOp == Builtin::BI__builtin_align_down) {
+ APSInt AlignedVal = APSInt(Src & ~(Align - 1), Src.isUnsigned());
+ pushInteger(S, AlignedVal, Call->getType());
+ } else {
+ assert(*S.Ctx.classify(Call->getType()) == PT_Bool);
+ S.Stk.push((Src & (Align - 1)) == 0);
+ }
+ return true;
+ }
+
+ assert(FirstArgT == PT_Ptr);
+ const Pointer &Ptr = S.Stk.peek(CallSize);
+
+ unsigned PtrOffset = Ptr.getIndex();
+ CharUnits BaseAlignment =
+ S.getCtx().getDeclAlign(Ptr.getDeclDesc()->asValueDecl());
+ CharUnits PtrAlign =
+ BaseAlignment.alignmentAtOffset(CharUnits::fromQuantity(PtrOffset));
+
+ if (BuiltinOp == Builtin::BI__builtin_is_aligned) {
+ if (PtrAlign.getQuantity() >= Alignment) {
+ S.Stk.push(true);
+ return true;
+ }
+ // If the alignment is not known to be sufficient, some cases could still
+ // be aligned at run time. However, if the requested alignment is less than
+ // or equal to the base alignment and the offset is not aligned, we know
+ // that the run-time value can never be aligned.
+ if (BaseAlignment.getQuantity() >= Alignment &&
+ PtrAlign.getQuantity() < Alignment) {
+ S.Stk.push(false);
+ return true;
+ }
+
+ S.FFDiag(Call->getArg(0), diag::note_constexpr_alignment_compute)
+ << Alignment;
+ return false;
+ }
+
+ assert(BuiltinOp == Builtin::BI__builtin_align_down ||
+ BuiltinOp == Builtin::BI__builtin_align_up);
+
+ // For align_up/align_down, we can return the same value if the alignment
+ // is known to be greater than or equal to the requested value.
+ if (PtrAlign.getQuantity() >= Alignment) {
+ S.Stk.push(Ptr);
+ return true;
+ }
+
+ // The alignment could be greater than the minimum at run-time, so we cannot
+ // infer much about the resulting pointer value. One case is possible:
+ // For `_Alignas(32) char buf[N]; __builtin_align_down(&buf[idx], 32)` we
+ // can infer the correct index if the requested alignment is smaller than
+ // the base alignment so we can perform the computation on the offset.
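+ // (Editor's worked example, not part of this patch: with a base alignment
+ // of 32, a requested alignment of 16, and an element offset of 20,
+ // align_down yields offset 16 and align_up yields offset 32.)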
+ if (BaseAlignment.getQuantity() >= Alignment) { + assert(Alignment.getBitWidth() <= 64 && + "Cannot handle > 64-bit address-space"); + uint64_t Alignment64 = Alignment.getZExtValue(); + CharUnits NewOffset = + CharUnits::fromQuantity(BuiltinOp == Builtin::BI__builtin_align_down + ? llvm::alignDown(PtrOffset, Alignment64) + : llvm::alignTo(PtrOffset, Alignment64)); + + S.Stk.push(Ptr.atIndex(NewOffset.getQuantity())); + return true; + } + + // Otherwise, we cannot constant-evaluate the result. + S.FFDiag(Call->getArg(0), diag::note_constexpr_alignment_adjust) << Alignment; + return false; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, const CallExpr *Call) { const InterpFrame *Frame = S.Current; @@ -1291,6 +1402,13 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_is_aligned: + case Builtin::BI__builtin_align_up: + case Builtin::BI__builtin_align_down: + if (!interp__builtin_is_aligned_up_down(S, OpPC, Frame, F, Call)) + return false; + break; + default: S.FFDiag(S.Current->getLocation(OpPC), diag::note_invalid_subexpr_in_const_expr) diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index 12e2e6ff9155b..ba957546473e9 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -152,6 +152,13 @@ void print(llvm::raw_ostream &OS, const Pointer &P, ASTContext &Ctx, } void InterpFrame::describe(llvm::raw_ostream &OS) const { + // We create frames for builtin functions as well, but we can't reliably + // diagnose them. The 'in call to' diagnostics for them add no value to the + // user _and_ it doesn't generally work since the argument types don't always + // match the function prototype. Just ignore them. + if (const auto *F = getFunction(); F && F->isBuiltin()) + return; + const FunctionDecl *F = getCallee(); if (const auto *M = dyn_cast(F); M && M->isInstance() && !isa(F)) { diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index fcd00aac62f93..b4475577b7462 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -241,13 +241,10 @@ class Pointer { /// Checks if the pointer is null. bool isZero() const { - if (Offset != 0) - return false; - if (isBlockPointer()) return asBlockPointer().Pointee == nullptr; assert(isIntegralPointer()); - return asIntPointer().Value == 0; + return asIntPointer().Value == 0 && Offset == 0; } /// Checks if the pointer is live. 
bool isLive() const {
diff --git a/clang/lib/AST/Interp/State.cpp b/clang/lib/AST/Interp/State.cpp index 47fbf5145cd4e..0d9dadec4b958 100644 --- a/clang/lib/AST/Interp/State.cpp +++ b/clang/lib/AST/Interp/State.cpp @@ -155,7 +155,8 @@ void State::addCallStack(unsigned Limit) {
SmallString<128> Buffer;
llvm::raw_svector_ostream Out(Buffer);
F->describe(Out);
- addDiag(CallRange.getBegin(), diag::note_constexpr_call_here)
- << Out.str() << CallRange;
+ if (!Buffer.empty())
+ addDiag(CallRange.getBegin(), diag::note_constexpr_call_here)
+ << Out.str() << CallRange;
} }
diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index dcb512cb51417..9c259c8f9bd0a 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -48,6 +48,26 @@ OpenACCIfClause::OpenACCIfClause(SourceLocation BeginLoc,
"Condition expression type not scalar/dependent"); }
+OpenACCSelfClause *OpenACCSelfClause::Create(const ASTContext &C,
+ SourceLocation BeginLoc,
+ SourceLocation LParenLoc,
+ Expr *ConditionExpr,
+ SourceLocation EndLoc) {
+ void *Mem =
+ C.Allocate(sizeof(OpenACCSelfClause), alignof(OpenACCSelfClause));
+ return new (Mem)
+ OpenACCSelfClause(BeginLoc, LParenLoc, ConditionExpr, EndLoc);
+}
+
+OpenACCSelfClause::OpenACCSelfClause(SourceLocation BeginLoc,
+ SourceLocation LParenLoc,
+ Expr *ConditionExpr, SourceLocation EndLoc)
+ : OpenACCClauseWithCondition(OpenACCClauseKind::Self, BeginLoc, LParenLoc,
+ ConditionExpr, EndLoc) {
+ assert((!ConditionExpr || ConditionExpr->isInstantiationDependent() ||
+ ConditionExpr->getType()->isScalarType()) &&
+ "Condition expression type not scalar/dependent");
+}
+
OpenACCClause::child_range OpenACCClause::children() { switch (getClauseKind()) { default: @@ -72,3 +92,9 @@ void OpenACCClausePrinter::VisitDefaultClause(const OpenACCDefaultClause &C) {
void OpenACCClausePrinter::VisitIfClause(const OpenACCIfClause &C) { OS << "if(" << C.getConditionExpr() << ")"; }
+
+void OpenACCClausePrinter::VisitSelfClause(const OpenACCSelfClause &C) {
+ OS << "self";
+ if (const Expr *CondExpr = C.getConditionExpr())
+ OS << "(" << CondExpr << ")";
+}
diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index d2aac1e640380..b26d804c6f079 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2071,13 +2071,31 @@ StmtProfiler::VisitLambdaExpr(const LambdaExpr *S) { }
CXXRecordDecl *Lambda = S->getLambdaClass();
- ID.AddInteger(Lambda->getODRHash());
-
for (const auto &Capture : Lambda->captures()) {
ID.AddInteger(Capture.getCaptureKind());
if (Capture.capturesVariable())
VisitDecl(Capture.getCapturedVar());
}
+
+ // Profiling the body of the lambda may be dangerous during deserialization,
+ // so we profile only the signature here.
+ ODRHash Hasher;
+ // FIXME: We can't get the call operator easily via
+ // `CXXRecordDecl::getLambdaCallOperator()` if we're in deserialization.
+ // So we have to do something raw here.
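+ // (Editor's note, not part of this patch: the loop below scans the lambda
+ // class's members for the call operator, via the templated declaration for
+ // generic lambdas and the plain FunctionDecl otherwise, and hashes only
+ // its declaration, skipping the body.)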
+ for (auto *SubDecl : Lambda->decls()) { + FunctionDecl *Call = nullptr; + if (auto *FTD = dyn_cast(SubDecl)) + Call = FTD->getTemplatedDecl(); + else if (auto *FD = dyn_cast(SubDecl)) + Call = FD; + + if (!Call) + continue; + + Hasher.AddFunctionDecl(Call, /*SkipBody=*/true); + } + ID.AddInteger(Hasher.CalculateHash()); } void @@ -2473,6 +2491,11 @@ void OpenACCClauseProfiler::VisitIfClause(const OpenACCIfClause &Clause) { "if clause requires a valid condition expr"); Profiler.VisitStmt(Clause.getConditionExpr()); } + +void OpenACCClauseProfiler::VisitSelfClause(const OpenACCSelfClause &Clause) { + if (Clause.hasConditionExpr()) + Profiler.VisitStmt(Clause.getConditionExpr()); +} } // namespace void StmtProfiler::VisitOpenACCComputeConstruct( diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 688daa64d6197..ff5b3df2d6dfa 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -398,6 +398,7 @@ void TextNodeDumper::Visit(const OpenACCClause *C) { OS << '(' << cast(C)->getDefaultClauseKind() << ')'; break; case OpenACCClauseKind::If: + case OpenACCClauseKind::Self: // The condition expression will be printed as a part of the 'children', // but print 'clause' here so it is clear what is happening from the dump. OS << " clause"; diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index bb042760d297a..941322be8f870 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -186,9 +186,10 @@ template <> struct NodeID { static constexpr StringRef value = "decl"; }; constexpr StringRef NodeID::value; constexpr StringRef NodeID::value; -template +template const Stmt *tryEachMatch(ArrayRef Matches, - ExprMutationAnalyzer *Analyzer, F Finder) { + ExprMutationAnalyzer::Analyzer *Analyzer, F Finder) { const StringRef ID = NodeID::value; for (const auto &Nodes : Matches) { if (const Stmt *S = (Analyzer->*Finder)(Nodes.getNodeAs(ID))) @@ -199,33 +200,37 @@ const Stmt *tryEachMatch(ArrayRef Matches, } // namespace -const Stmt *ExprMutationAnalyzer::findMutation(const Expr *Exp) { - return findMutationMemoized(Exp, - {&ExprMutationAnalyzer::findDirectMutation, - &ExprMutationAnalyzer::findMemberMutation, - &ExprMutationAnalyzer::findArrayElementMutation, - &ExprMutationAnalyzer::findCastMutation, - &ExprMutationAnalyzer::findRangeLoopMutation, - &ExprMutationAnalyzer::findReferenceMutation, - &ExprMutationAnalyzer::findFunctionArgMutation}, - Results); +const Stmt *ExprMutationAnalyzer::Analyzer::findMutation(const Expr *Exp) { + return findMutationMemoized( + Exp, + {&ExprMutationAnalyzer::Analyzer::findDirectMutation, + &ExprMutationAnalyzer::Analyzer::findMemberMutation, + &ExprMutationAnalyzer::Analyzer::findArrayElementMutation, + &ExprMutationAnalyzer::Analyzer::findCastMutation, + &ExprMutationAnalyzer::Analyzer::findRangeLoopMutation, + &ExprMutationAnalyzer::Analyzer::findReferenceMutation, + &ExprMutationAnalyzer::Analyzer::findFunctionArgMutation}, + Memorized.Results); } -const Stmt *ExprMutationAnalyzer::findMutation(const Decl *Dec) { - return tryEachDeclRef(Dec, &ExprMutationAnalyzer::findMutation); +const Stmt *ExprMutationAnalyzer::Analyzer::findMutation(const Decl *Dec) { + return tryEachDeclRef(Dec, &ExprMutationAnalyzer::Analyzer::findMutation); } -const Stmt *ExprMutationAnalyzer::findPointeeMutation(const Expr *Exp) { - return findMutationMemoized(Exp, {/*TODO*/}, PointeeResults); +const Stmt * 
+ExprMutationAnalyzer::Analyzer::findPointeeMutation(const Expr *Exp) { + return findMutationMemoized(Exp, {/*TODO*/}, Memorized.PointeeResults); } -const Stmt *ExprMutationAnalyzer::findPointeeMutation(const Decl *Dec) { - return tryEachDeclRef(Dec, &ExprMutationAnalyzer::findPointeeMutation); +const Stmt * +ExprMutationAnalyzer::Analyzer::findPointeeMutation(const Decl *Dec) { + return tryEachDeclRef(Dec, + &ExprMutationAnalyzer::Analyzer::findPointeeMutation); } -const Stmt *ExprMutationAnalyzer::findMutationMemoized( +const Stmt *ExprMutationAnalyzer::Analyzer::findMutationMemoized( const Expr *Exp, llvm::ArrayRef Finders, - ResultMap &MemoizedResults) { + Memoized::ResultMap &MemoizedResults) { const auto Memoized = MemoizedResults.find(Exp); if (Memoized != MemoizedResults.end()) return Memoized->second; @@ -241,8 +246,9 @@ const Stmt *ExprMutationAnalyzer::findMutationMemoized( return MemoizedResults[Exp] = nullptr; } -const Stmt *ExprMutationAnalyzer::tryEachDeclRef(const Decl *Dec, - MutationFinder Finder) { +const Stmt * +ExprMutationAnalyzer::Analyzer::tryEachDeclRef(const Decl *Dec, + MutationFinder Finder) { const auto Refs = match( findAll( declRefExpr(to( @@ -261,8 +267,9 @@ const Stmt *ExprMutationAnalyzer::tryEachDeclRef(const Decl *Dec, return nullptr; } -bool ExprMutationAnalyzer::isUnevaluated(const Stmt *Exp, const Stmt &Stm, - ASTContext &Context) { +bool ExprMutationAnalyzer::Analyzer::isUnevaluated(const Stmt *Exp, + const Stmt &Stm, + ASTContext &Context) { return selectFirst( NodeID::value, match( @@ -293,33 +300,36 @@ bool ExprMutationAnalyzer::isUnevaluated(const Stmt *Exp, const Stmt &Stm, Stm, Context)) != nullptr; } -bool ExprMutationAnalyzer::isUnevaluated(const Expr *Exp) { +bool ExprMutationAnalyzer::Analyzer::isUnevaluated(const Expr *Exp) { return isUnevaluated(Exp, Stm, Context); } const Stmt * -ExprMutationAnalyzer::findExprMutation(ArrayRef Matches) { - return tryEachMatch(Matches, this, &ExprMutationAnalyzer::findMutation); +ExprMutationAnalyzer::Analyzer::findExprMutation(ArrayRef Matches) { + return tryEachMatch(Matches, this, + &ExprMutationAnalyzer::Analyzer::findMutation); } const Stmt * -ExprMutationAnalyzer::findDeclMutation(ArrayRef Matches) { - return tryEachMatch(Matches, this, &ExprMutationAnalyzer::findMutation); +ExprMutationAnalyzer::Analyzer::findDeclMutation(ArrayRef Matches) { + return tryEachMatch(Matches, this, + &ExprMutationAnalyzer::Analyzer::findMutation); } -const Stmt *ExprMutationAnalyzer::findExprPointeeMutation( +const Stmt *ExprMutationAnalyzer::Analyzer::findExprPointeeMutation( ArrayRef Matches) { - return tryEachMatch(Matches, this, - &ExprMutationAnalyzer::findPointeeMutation); + return tryEachMatch( + Matches, this, &ExprMutationAnalyzer::Analyzer::findPointeeMutation); } -const Stmt *ExprMutationAnalyzer::findDeclPointeeMutation( +const Stmt *ExprMutationAnalyzer::Analyzer::findDeclPointeeMutation( ArrayRef Matches) { - return tryEachMatch(Matches, this, - &ExprMutationAnalyzer::findPointeeMutation); + return tryEachMatch( + Matches, this, &ExprMutationAnalyzer::Analyzer::findPointeeMutation); } -const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findDirectMutation(const Expr *Exp) { // LHS of any assignment operators. 
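The mechanical `ExprMutationAnalyzer` → `ExprMutationAnalyzer::Analyzer` renames in this file all implement one idea: the recursive analysis engine becomes an inner class that borrows a shared `Memoized` cache instead of owning its result maps, so nested analyses reuse one table. A minimal sketch of that shape (illustrative names, not the clang API):

```cpp
#include <map>
#include <string>

class Analysis {
public:
  // All memoized state lives here; it can be shared by many engines.
  struct Memoized {
    std::map<std::string, bool> Results;
  };

  // The engine borrows the cache, and hands the same cache to any
  // nested engine it spawns, so a whole session shares one table.
  class Engine {
  public:
    explicit Engine(Memoized &M) : M(M) {}
    bool isMutated(const std::string &Key) {
      if (auto It = M.Results.find(Key); It != M.Results.end())
        return It->second;
      bool R = expensiveAnalysis(Key);
      return M.Results[Key] = R;
    }

  private:
    bool expensiveAnalysis(const std::string &) { return false; }
    Memoized &M;
  };

  Analysis() : Impl(Cache) {}

private:
  Memoized Cache;
  Engine Impl;
};
```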
const auto AsAssignmentLhs = binaryOperator(isAssignmentOperator(), hasLHS(canResolveToExpr(Exp))); @@ -426,7 +436,7 @@ const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { const auto AsNonConstRefReturn = returnStmt(hasReturnValue(canResolveToExpr(Exp))); - // It is used as a non-const-reference for initalizing a range-for loop. + // It is used as a non-const-reference for initializing a range-for loop. const auto AsNonConstRefRangeInit = cxxForRangeStmt(hasRangeInit(declRefExpr( allOf(canResolveToExpr(Exp), hasType(nonConstReferenceType()))))); @@ -443,7 +453,8 @@ const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { return selectFirst("stmt", Matches); } -const Stmt *ExprMutationAnalyzer::findMemberMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findMemberMutation(const Expr *Exp) { // Check whether any member of 'Exp' is mutated. const auto MemberExprs = match( findAll(expr(anyOf(memberExpr(hasObjectExpression(canResolveToExpr(Exp))), @@ -456,7 +467,8 @@ const Stmt *ExprMutationAnalyzer::findMemberMutation(const Expr *Exp) { return findExprMutation(MemberExprs); } -const Stmt *ExprMutationAnalyzer::findArrayElementMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findArrayElementMutation(const Expr *Exp) { // Check whether any element of an array is mutated. const auto SubscriptExprs = match( findAll(arraySubscriptExpr( @@ -469,7 +481,7 @@ const Stmt *ExprMutationAnalyzer::findArrayElementMutation(const Expr *Exp) { return findExprMutation(SubscriptExprs); } -const Stmt *ExprMutationAnalyzer::findCastMutation(const Expr *Exp) { +const Stmt *ExprMutationAnalyzer::Analyzer::findCastMutation(const Expr *Exp) { // If the 'Exp' is explicitly casted to a non-const reference type the // 'Exp' is considered to be modified. const auto ExplicitCast = @@ -504,7 +516,8 @@ const Stmt *ExprMutationAnalyzer::findCastMutation(const Expr *Exp) { return findExprMutation(Calls); } -const Stmt *ExprMutationAnalyzer::findRangeLoopMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findRangeLoopMutation(const Expr *Exp) { // Keep the ordering for the specific initialization matches to happen first, // because it is cheaper to match all potential modifications of the loop // variable. @@ -567,7 +580,8 @@ const Stmt *ExprMutationAnalyzer::findRangeLoopMutation(const Expr *Exp) { return findDeclMutation(LoopVars); } -const Stmt *ExprMutationAnalyzer::findReferenceMutation(const Expr *Exp) { +const Stmt * +ExprMutationAnalyzer::Analyzer::findReferenceMutation(const Expr *Exp) { // Follow non-const reference returned by `operator*()` of move-only classes. // These are typically smart pointers with unique ownership so we treat // mutation of pointee as mutation of the smart pointer itself. 
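The `FunctionParmMutationAnalyzer::findMutation` hunk just below breaks `call A -> call B -> call A` cycles by seeding the memo table with "no mutation" before analyzing a parameter. The pattern in isolation (toy types; `AnalyzeBody` stands in for the body analysis, which may re-enter `findMutation`):

```cpp
#include <functional>
#include <map>

using Result = const char *;
std::map<int, Result> Results;
std::function<Result(int)> AnalyzeBody; // may call findMutation again

Result findMutation(int Parm) {
  if (auto It = Results.find(Parm); It != Results.end())
    return It->second;
  // Seed with "not mutated" so a recursive query for the same parameter
  // terminates via the memo table instead of recursing forever.
  Results[Parm] = nullptr;
  if (Result S = AnalyzeBody(Parm))
    return Results[Parm] = S;
  return Results[Parm];
}

int main() {
  // A's body asks about A again, as in mutually recursive functions.
  AnalyzeBody = [](int P) -> Result { return findMutation(P); };
  return findMutation(0) != nullptr; // terminates: "not mutated"
}
```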
@@ -599,7 +613,8 @@ const Stmt *ExprMutationAnalyzer::findReferenceMutation(const Expr *Exp) {
   return findDeclMutation(Refs);
 }
 
-const Stmt *ExprMutationAnalyzer::findFunctionArgMutation(const Expr *Exp) {
+const Stmt *
+ExprMutationAnalyzer::Analyzer::findFunctionArgMutation(const Expr *Exp) {
   const auto NonConstRefParam = forEachArgumentWithParam(
       canResolveToExpr(Exp),
       parmVarDecl(hasType(nonConstReferenceType())).bind("parm"));
@@ -637,10 +652,9 @@ const Stmt *ExprMutationAnalyzer::findFunctionArgMutation(const Expr *Exp) {
     if (const auto *RefType = ParmType->getAs<ReferenceType>()) {
       if (!RefType->getPointeeType().getQualifiers() &&
           RefType->getPointeeType()->getAs<TemplateTypeParmType>()) {
-        std::unique_ptr<FunctionParmMutationAnalyzer> &Analyzer =
-            FuncParmAnalyzer[Func];
-        if (!Analyzer)
-          Analyzer.reset(new FunctionParmMutationAnalyzer(*Func, Context));
+        FunctionParmMutationAnalyzer *Analyzer =
+            FunctionParmMutationAnalyzer::getFunctionParmMutationAnalyzer(
+                *Func, Context, Memorized);
         if (Analyzer->findMutation(Parm))
           return Exp;
         continue;
@@ -653,13 +667,15 @@
 }
 
 FunctionParmMutationAnalyzer::FunctionParmMutationAnalyzer(
-    const FunctionDecl &Func, ASTContext &Context)
-    : BodyAnalyzer(*Func.getBody(), Context) {
+    const FunctionDecl &Func, ASTContext &Context,
+    ExprMutationAnalyzer::Memoized &Memorized)
+    : BodyAnalyzer(*Func.getBody(), Context, Memorized) {
   if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(&Func)) {
     // CXXCtorInitializer might also mutate Param but they're not part of
     // function body, check them eagerly here since they're typically trivial.
     for (const CXXCtorInitializer *Init : Ctor->inits()) {
-      ExprMutationAnalyzer InitAnalyzer(*Init->getInit(), Context);
+      ExprMutationAnalyzer::Analyzer InitAnalyzer(*Init->getInit(), Context,
+                                                  Memorized);
       for (const ParmVarDecl *Parm : Ctor->parameters()) {
         if (Results.contains(Parm))
           continue;
@@ -675,11 +691,14 @@ FunctionParmMutationAnalyzer::findMutation(const ParmVarDecl *Parm) {
   const auto Memoized = Results.find(Parm);
   if (Memoized != Results.end())
     return Memoized->second;
-
+  // To handle a cycle like call A -> call B -> call A, assume the parameters
+  // of A are not mutated before analyzing them. Then, when the second
+  // "call A" is analyzed, FunctionParmMutationAnalyzer finds this memoized
+  // value and avoids infinite recursion.
+  Results[Parm] = nullptr;
   if (const Stmt *S = BodyAnalyzer.findMutation(Parm))
     return Results[Parm] = S;
-
-  return Results[Parm] = nullptr;
+  return Results[Parm];
 }
 
 } // namespace clang
diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
new file mode 100644
index 0000000000000..75188aef4d1a4
--- /dev/null
+++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
@@ -0,0 +1,249 @@
+//===-- ASTOps.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Operations on AST nodes that are used in flow-sensitive analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
+#include "clang/AST/ComputeDependence.h"
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/ExprCXX.h"
+#include "clang/AST/Stmt.h"
+#include "clang/AST/Type.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Basic/LLVM.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cassert>
+#include <iterator>
+#include <vector>
+
+#define DEBUG_TYPE "dataflow"
+
+namespace clang::dataflow {
+
+const Expr &ignoreCFGOmittedNodes(const Expr &E) {
+  const Expr *Current = &E;
+  if (auto *EWC = dyn_cast<ExprWithCleanups>(Current)) {
+    Current = EWC->getSubExpr();
+    assert(Current != nullptr);
+  }
+  Current = Current->IgnoreParens();
+  assert(Current != nullptr);
+  return *Current;
+}
+
+const Stmt &ignoreCFGOmittedNodes(const Stmt &S) {
+  if (auto *E = dyn_cast<Expr>(&S))
+    return ignoreCFGOmittedNodes(*E);
+  return S;
+}
+
+// FIXME: Does not precisely handle non-virtual diamond inheritance. A single
+// field decl will be modeled for all instances of the inherited field.
+static void getFieldsFromClassHierarchy(QualType Type, FieldSet &Fields) {
+  if (Type->isIncompleteType() || Type->isDependentType() ||
+      !Type->isRecordType())
+    return;
+
+  for (const FieldDecl *Field : Type->getAsRecordDecl()->fields())
+    Fields.insert(Field);
+  if (auto *CXXRecord = Type->getAsCXXRecordDecl())
+    for (const CXXBaseSpecifier &Base : CXXRecord->bases())
+      getFieldsFromClassHierarchy(Base.getType(), Fields);
+}
+
+/// Gets the set of all fields in the type.
+FieldSet getObjectFields(QualType Type) {
+  FieldSet Fields;
+  getFieldsFromClassHierarchy(Type, Fields);
+  return Fields;
+}
+
+bool containsSameFields(const FieldSet &Fields,
+                        const RecordStorageLocation::FieldToLoc &FieldLocs) {
+  if (Fields.size() != FieldLocs.size())
+    return false;
+  for ([[maybe_unused]] auto [Field, Loc] : FieldLocs)
+    if (!Fields.contains(cast_or_null<FieldDecl>(Field)))
+      return false;
+  return true;
+}
+
+/// Returns the fields of a `RecordDecl` that are initialized by an
+/// `InitListExpr`, in the order in which they appear in
+/// `InitListExpr::inits()`.
+/// `InitList->getType()` must be a record type.
+static std::vector<const FieldDecl *>
+getFieldsForInitListExpr(const InitListExpr *InitList) {
+  const RecordDecl *RD = InitList->getType()->getAsRecordDecl();
+  assert(RD != nullptr);
+
+  std::vector<const FieldDecl *> Fields;
+
+  if (InitList->getType()->isUnionType()) {
+    Fields.push_back(InitList->getInitializedFieldInUnion());
+    return Fields;
+  }
+
+  // Unnamed bitfields are only used for padding and do not appear in
+  // `InitListExpr`'s inits. However, those fields do appear in `RecordDecl`'s
+  // field list, and we thus need to remove them before mapping inits to
+  // fields to avoid mapping inits to the wrong fields.
+  llvm::copy_if(
+      RD->fields(), std::back_inserter(Fields),
+      [](const FieldDecl *Field) { return !Field->isUnnamedBitfield(); });
+  return Fields;
+}
+
+RecordInitListHelper::RecordInitListHelper(const InitListExpr *InitList) {
+  auto *RD = InitList->getType()->getAsCXXRecordDecl();
+  assert(RD != nullptr);
+
+  std::vector<const FieldDecl *> Fields = getFieldsForInitListExpr(InitList);
+  ArrayRef<Expr *> Inits = InitList->inits();
+
+  // Unions initialized with an empty initializer list need special treatment.
+ // For structs/classes initialized with an empty initializer list, Clang + // puts `ImplicitValueInitExpr`s in `InitListExpr::inits()`, but for unions, + // it doesn't do this -- so we create an `ImplicitValueInitExpr` ourselves. + SmallVector InitsForUnion; + if (InitList->getType()->isUnionType() && Inits.empty()) { + assert(Fields.size() == 1); + ImplicitValueInitForUnion.emplace(Fields.front()->getType()); + InitsForUnion.push_back(&*ImplicitValueInitForUnion); + Inits = InitsForUnion; + } + + size_t InitIdx = 0; + + assert(Fields.size() + RD->getNumBases() == Inits.size()); + for (const CXXBaseSpecifier &Base : RD->bases()) { + assert(InitIdx < Inits.size()); + Expr *Init = Inits[InitIdx++]; + BaseInits.emplace_back(&Base, Init); + } + + assert(Fields.size() == Inits.size() - InitIdx); + for (const FieldDecl *Field : Fields) { + assert(InitIdx < Inits.size()); + Expr *Init = Inits[InitIdx++]; + FieldInits.emplace_back(Field, Init); + } +} + +static void insertIfGlobal(const Decl &D, + llvm::DenseSet &Globals) { + if (auto *V = dyn_cast(&D)) + if (V->hasGlobalStorage()) + Globals.insert(V); +} + +static void insertIfFunction(const Decl &D, + llvm::DenseSet &Funcs) { + if (auto *FD = dyn_cast(&D)) + Funcs.insert(FD); +} + +static MemberExpr *getMemberForAccessor(const CXXMemberCallExpr &C) { + // Use getCalleeDecl instead of getMethodDecl in order to handle + // pointer-to-member calls. + const auto *MethodDecl = dyn_cast_or_null(C.getCalleeDecl()); + if (!MethodDecl) + return nullptr; + auto *Body = dyn_cast_or_null(MethodDecl->getBody()); + if (!Body || Body->size() != 1) + return nullptr; + if (auto *RS = dyn_cast(*Body->body_begin())) + if (auto *Return = RS->getRetValue()) + return dyn_cast(Return->IgnoreParenImpCasts()); + return nullptr; +} + +static void getReferencedDecls(const Decl &D, ReferencedDecls &Referenced) { + insertIfGlobal(D, Referenced.Globals); + insertIfFunction(D, Referenced.Functions); + if (const auto *Decomp = dyn_cast(&D)) + for (const auto *B : Decomp->bindings()) + if (auto *ME = dyn_cast_or_null(B->getBinding())) + // FIXME: should we be using `E->getFoundDecl()`? + if (const auto *FD = dyn_cast(ME->getMemberDecl())) + Referenced.Fields.insert(FD); +} + +/// Traverses `S` and inserts into `Referenced` any declarations that are +/// declared in or referenced from sub-statements. +static void getReferencedDecls(const Stmt &S, ReferencedDecls &Referenced) { + for (auto *Child : S.children()) + if (Child != nullptr) + getReferencedDecls(*Child, Referenced); + if (const auto *DefaultArg = dyn_cast(&S)) + getReferencedDecls(*DefaultArg->getExpr(), Referenced); + if (const auto *DefaultInit = dyn_cast(&S)) + getReferencedDecls(*DefaultInit->getExpr(), Referenced); + + if (auto *DS = dyn_cast(&S)) { + if (DS->isSingleDecl()) + getReferencedDecls(*DS->getSingleDecl(), Referenced); + else + for (auto *D : DS->getDeclGroup()) + getReferencedDecls(*D, Referenced); + } else if (auto *E = dyn_cast(&S)) { + insertIfGlobal(*E->getDecl(), Referenced.Globals); + insertIfFunction(*E->getDecl(), Referenced.Functions); + } else if (const auto *C = dyn_cast(&S)) { + // If this is a method that returns a member variable but does nothing else, + // model the field of the return value. + if (MemberExpr *E = getMemberForAccessor(*C)) + if (const auto *FD = dyn_cast(E->getMemberDecl())) + Referenced.Fields.insert(FD); + } else if (auto *E = dyn_cast(&S)) { + // FIXME: should we be using `E->getFoundDecl()`? 
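`RecordInitListHelper` above pairs each init expression with a base class first and then with a field, walking one shared index through `inits()`. The same pairing in a standalone toy (illustrative names, simplified stand-in types):

```cpp
#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> Bases = {"Base1"};
  std::vector<std::string> Fields = {"x", "y"};
  std::vector<int> Inits = {1, 2, 3}; // one init per base, then one per field

  assert(Bases.size() + Fields.size() == Inits.size());
  size_t InitIdx = 0;
  for (const std::string &B : Bases)
    std::printf("base  %s <- init %d\n", B.c_str(), Inits[InitIdx++]);
  for (const std::string &F : Fields)
    std::printf("field %s <- init %d\n", F.c_str(), Inits[InitIdx++]);
}
```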
+ const ValueDecl *VD = E->getMemberDecl(); + insertIfGlobal(*VD, Referenced.Globals); + insertIfFunction(*VD, Referenced.Functions); + if (const auto *FD = dyn_cast(VD)) + Referenced.Fields.insert(FD); + } else if (auto *InitList = dyn_cast(&S)) { + if (InitList->getType()->isRecordType()) + for (const auto *FD : getFieldsForInitListExpr(InitList)) + Referenced.Fields.insert(FD); + } +} + +ReferencedDecls getReferencedDecls(const FunctionDecl &FD) { + ReferencedDecls Result; + // Look for global variable and field references in the + // constructor-initializers. + if (const auto *CtorDecl = dyn_cast(&FD)) { + for (const auto *Init : CtorDecl->inits()) { + if (Init->isMemberInitializer()) { + Result.Fields.insert(Init->getMember()); + } else if (Init->isIndirectMemberInitializer()) { + for (const auto *I : Init->getIndirectMember()->chain()) + Result.Fields.insert(cast(I)); + } + const Expr *E = Init->getInit(); + assert(E != nullptr); + getReferencedDecls(*E, Result); + } + // Add all fields mentioned in default member initializers. + for (const FieldDecl *F : CtorDecl->getParent()->fields()) + if (const auto *I = F->getInClassInitializer()) + getReferencedDecls(*I, Result); + } + getReferencedDecls(*FD.getBody(), Result); + + return Result; +} + +} // namespace clang::dataflow diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt index a3b5d9adc24bd..6631fe27f3d90 100644 --- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_library(clangAnalysisFlowSensitive AdornedCFG.cpp Arena.cpp + ASTOps.cpp DataflowAnalysisContext.cpp DataflowEnvironment.cpp Formula.cpp diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp index d520539dd2535..e94fd39c45dc1 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp @@ -14,6 +14,7 @@ #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h" #include "clang/AST/ExprCXX.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/DebugSupport.h" #include "clang/Analysis/FlowSensitive/Formula.h" #include "clang/Analysis/FlowSensitive/Logger.h" @@ -359,55 +360,3 @@ DataflowAnalysisContext::~DataflowAnalysisContext() = default; } // namespace dataflow } // namespace clang - -using namespace clang; - -const Expr &clang::dataflow::ignoreCFGOmittedNodes(const Expr &E) { - const Expr *Current = &E; - if (auto *EWC = dyn_cast(Current)) { - Current = EWC->getSubExpr(); - assert(Current != nullptr); - } - Current = Current->IgnoreParens(); - assert(Current != nullptr); - return *Current; -} - -const Stmt &clang::dataflow::ignoreCFGOmittedNodes(const Stmt &S) { - if (auto *E = dyn_cast(&S)) - return ignoreCFGOmittedNodes(*E); - return S; -} - -// FIXME: Does not precisely handle non-virtual diamond inheritance. A single -// field decl will be modeled for all instances of the inherited field. 
-static void getFieldsFromClassHierarchy(QualType Type, - clang::dataflow::FieldSet &Fields) { - if (Type->isIncompleteType() || Type->isDependentType() || - !Type->isRecordType()) - return; - - for (const FieldDecl *Field : Type->getAsRecordDecl()->fields()) - Fields.insert(Field); - if (auto *CXXRecord = Type->getAsCXXRecordDecl()) - for (const CXXBaseSpecifier &Base : CXXRecord->bases()) - getFieldsFromClassHierarchy(Base.getType(), Fields); -} - -/// Gets the set of all fields in the type. -clang::dataflow::FieldSet clang::dataflow::getObjectFields(QualType Type) { - FieldSet Fields; - getFieldsFromClassHierarchy(Type, Fields); - return Fields; -} - -bool clang::dataflow::containsSameFields( - const clang::dataflow::FieldSet &Fields, - const clang::dataflow::RecordStorageLocation::FieldToLoc &FieldLocs) { - if (Fields.size() != FieldLocs.size()) - return false; - for ([[maybe_unused]] auto [Field, Loc] : FieldLocs) - if (!Fields.contains(cast_or_null(Field))) - return false; - return true; -} diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp index bea15ce9bd24d..3f1600d9ac5d8 100644 --- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp @@ -17,6 +17,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/RecursiveASTVisitor.h" #include "clang/AST/Type.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/DataflowLattice.h" #include "clang/Analysis/FlowSensitive/Value.h" #include "llvm/ADT/DenseMap.h" @@ -304,93 +305,6 @@ widenKeyToValueMap(const llvm::MapVector &CurMap, return WidenedMap; } -/// Initializes a global storage value. -static void insertIfGlobal(const Decl &D, - llvm::DenseSet &Vars) { - if (auto *V = dyn_cast(&D)) - if (V->hasGlobalStorage()) - Vars.insert(V); -} - -static void insertIfFunction(const Decl &D, - llvm::DenseSet &Funcs) { - if (auto *FD = dyn_cast(&D)) - Funcs.insert(FD); -} - -static MemberExpr *getMemberForAccessor(const CXXMemberCallExpr &C) { - // Use getCalleeDecl instead of getMethodDecl in order to handle - // pointer-to-member calls. - const auto *MethodDecl = dyn_cast_or_null(C.getCalleeDecl()); - if (!MethodDecl) - return nullptr; - auto *Body = dyn_cast_or_null(MethodDecl->getBody()); - if (!Body || Body->size() != 1) - return nullptr; - if (auto *RS = dyn_cast(*Body->body_begin())) - if (auto *Return = RS->getRetValue()) - return dyn_cast(Return->IgnoreParenImpCasts()); - return nullptr; -} - -static void -getFieldsGlobalsAndFuncs(const Decl &D, FieldSet &Fields, - llvm::DenseSet &Vars, - llvm::DenseSet &Funcs) { - insertIfGlobal(D, Vars); - insertIfFunction(D, Funcs); - if (const auto *Decomp = dyn_cast(&D)) - for (const auto *B : Decomp->bindings()) - if (auto *ME = dyn_cast_or_null(B->getBinding())) - // FIXME: should we be using `E->getFoundDecl()`? - if (const auto *FD = dyn_cast(ME->getMemberDecl())) - Fields.insert(FD); -} - -/// Traverses `S` and inserts into `Fields`, `Vars` and `Funcs` any fields, -/// global variables and functions that are declared in or referenced from -/// sub-statements. 
-static void -getFieldsGlobalsAndFuncs(const Stmt &S, FieldSet &Fields, - llvm::DenseSet &Vars, - llvm::DenseSet &Funcs) { - for (auto *Child : S.children()) - if (Child != nullptr) - getFieldsGlobalsAndFuncs(*Child, Fields, Vars, Funcs); - if (const auto *DefaultArg = dyn_cast(&S)) - getFieldsGlobalsAndFuncs(*DefaultArg->getExpr(), Fields, Vars, Funcs); - if (const auto *DefaultInit = dyn_cast(&S)) - getFieldsGlobalsAndFuncs(*DefaultInit->getExpr(), Fields, Vars, Funcs); - - if (auto *DS = dyn_cast(&S)) { - if (DS->isSingleDecl()) - getFieldsGlobalsAndFuncs(*DS->getSingleDecl(), Fields, Vars, Funcs); - else - for (auto *D : DS->getDeclGroup()) - getFieldsGlobalsAndFuncs(*D, Fields, Vars, Funcs); - } else if (auto *E = dyn_cast(&S)) { - insertIfGlobal(*E->getDecl(), Vars); - insertIfFunction(*E->getDecl(), Funcs); - } else if (const auto *C = dyn_cast(&S)) { - // If this is a method that returns a member variable but does nothing else, - // model the field of the return value. - if (MemberExpr *E = getMemberForAccessor(*C)) - if (const auto *FD = dyn_cast(E->getMemberDecl())) - Fields.insert(FD); - } else if (auto *E = dyn_cast(&S)) { - // FIXME: should we be using `E->getFoundDecl()`? - const ValueDecl *VD = E->getMemberDecl(); - insertIfGlobal(*VD, Vars); - insertIfFunction(*VD, Funcs); - if (const auto *FD = dyn_cast(VD)) - Fields.insert(FD); - } else if (auto *InitList = dyn_cast(&S)) { - if (InitList->getType()->isRecordType()) - for (const auto *FD : getFieldsForInitListExpr(InitList)) - Fields.insert(FD); - } -} - namespace { // Visitor that builds a map from record prvalues to result objects. @@ -505,7 +419,16 @@ class ResultObjectVisitor : public RecursiveASTVisitor { // below them can initialize the same object (or part of it). if (isa(E) || isa(E) || isa(E) || isa(E) || isa(E) || - isa(E)) { + isa(E) || + // We treat `BuiltinBitCastExpr` as an "original initializer" too as + // it may not even be casting from a record type -- and even if it is, + // the two objects are in general of unrelated type. + isa(E)) { + return; + } + if (auto *Op = dyn_cast(E); + Op && Op->getOpcode() == BO_Cmp) { + // Builtin `<=>` returns a `std::strong_ordering` object. return; } @@ -551,6 +474,11 @@ class ResultObjectVisitor : public RecursiveASTVisitor { return; } + if (auto *SE = dyn_cast(E)) { + PropagateResultObject(cast(SE->getSubStmt()->body_back()), Loc); + return; + } + // All other expression nodes that propagate a record prvalue should have // exactly one child. SmallVector Children(E->child_begin(), E->child_end()); @@ -648,36 +576,13 @@ void Environment::initialize() { void Environment::initFieldsGlobalsAndFuncs(const FunctionDecl *FuncDecl) { assert(FuncDecl->doesThisDeclarationHaveABody()); - FieldSet Fields; - llvm::DenseSet Vars; - llvm::DenseSet Funcs; - - // Look for global variable and field references in the - // constructor-initializers. - if (const auto *CtorDecl = dyn_cast(FuncDecl)) { - for (const auto *Init : CtorDecl->inits()) { - if (Init->isMemberInitializer()) { - Fields.insert(Init->getMember()); - } else if (Init->isIndirectMemberInitializer()) { - for (const auto *I : Init->getIndirectMember()->chain()) - Fields.insert(cast(I)); - } - const Expr *E = Init->getInit(); - assert(E != nullptr); - getFieldsGlobalsAndFuncs(*E, Fields, Vars, Funcs); - } - // Add all fields mentioned in default member initializers. 
- for (const FieldDecl *F : CtorDecl->getParent()->fields()) - if (const auto *I = F->getInClassInitializer()) - getFieldsGlobalsAndFuncs(*I, Fields, Vars, Funcs); - } - getFieldsGlobalsAndFuncs(*FuncDecl->getBody(), Fields, Vars, Funcs); + ReferencedDecls Referenced = getReferencedDecls(*FuncDecl); // These have to be added before the lines that follow to ensure that // `create*` work correctly for structs. - DACtx->addModeledFields(Fields); + DACtx->addModeledFields(Referenced.Fields); - for (const VarDecl *D : Vars) { + for (const VarDecl *D : Referenced.Globals) { if (getStorageLocation(*D) != nullptr) continue; @@ -689,7 +594,7 @@ void Environment::initFieldsGlobalsAndFuncs(const FunctionDecl *FuncDecl) { setStorageLocation(*D, createObject(*D, nullptr)); } - for (const FunctionDecl *FD : Funcs) { + for (const FunctionDecl *FD : Referenced.Functions) { if (getStorageLocation(*FD) != nullptr) continue; auto &Loc = createStorageLocation(*FD); @@ -1349,64 +1254,6 @@ RecordStorageLocation *getBaseObjectLocation(const MemberExpr &ME, return Env.get(*Base); } -std::vector -getFieldsForInitListExpr(const InitListExpr *InitList) { - const RecordDecl *RD = InitList->getType()->getAsRecordDecl(); - assert(RD != nullptr); - - std::vector Fields; - - if (InitList->getType()->isUnionType()) { - Fields.push_back(InitList->getInitializedFieldInUnion()); - return Fields; - } - - // Unnamed bitfields are only used for padding and do not appear in - // `InitListExpr`'s inits. However, those fields do appear in `RecordDecl`'s - // field list, and we thus need to remove them before mapping inits to - // fields to avoid mapping inits to the wrongs fields. - llvm::copy_if( - RD->fields(), std::back_inserter(Fields), - [](const FieldDecl *Field) { return !Field->isUnnamedBitfield(); }); - return Fields; -} - -RecordInitListHelper::RecordInitListHelper(const InitListExpr *InitList) { - auto *RD = InitList->getType()->getAsCXXRecordDecl(); - assert(RD != nullptr); - - std::vector Fields = getFieldsForInitListExpr(InitList); - ArrayRef Inits = InitList->inits(); - - // Unions initialized with an empty initializer list need special treatment. - // For structs/classes initialized with an empty initializer list, Clang - // puts `ImplicitValueInitExpr`s in `InitListExpr::inits()`, but for unions, - // it doesn't do this -- so we create an `ImplicitValueInitExpr` ourselves. 
- SmallVector InitsForUnion; - if (InitList->getType()->isUnionType() && Inits.empty()) { - assert(Fields.size() == 1); - ImplicitValueInitForUnion.emplace(Fields.front()->getType()); - InitsForUnion.push_back(&*ImplicitValueInitForUnion); - Inits = InitsForUnion; - } - - size_t InitIdx = 0; - - assert(Fields.size() + RD->getNumBases() == Inits.size()); - for (const CXXBaseSpecifier &Base : RD->bases()) { - assert(InitIdx < Inits.size()); - Expr *Init = Inits[InitIdx++]; - BaseInits.emplace_back(&Base, Init); - } - - assert(Fields.size() == Inits.size() - InitIdx); - for (const FieldDecl *Field : Fields) { - assert(InitIdx < Inits.size()); - Expr *Init = Inits[InitIdx++]; - FieldInits.emplace_back(Field, Init); - } -} - RecordValue &refreshRecordValue(RecordStorageLocation &Loc, Environment &Env) { auto &NewVal = Env.create(Loc); Env.setValue(Loc, NewVal); diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 88a9c0eccbebc..1e034771014ea 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -20,7 +20,9 @@ #include "clang/AST/OperationKinds.h" #include "clang/AST/Stmt.h" #include "clang/AST/StmtVisitor.h" +#include "clang/Analysis/FlowSensitive/ASTOps.h" #include "clang/Analysis/FlowSensitive/AdornedCFG.h" +#include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/NoopAnalysis.h" #include "clang/Analysis/FlowSensitive/RecordOps.h" diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index e03fe1b683004..c42e70d5b95ac 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -1114,7 +1114,7 @@ class UPCAddressofArraySubscriptGadget : public FixableGadget { virtual DeclUseList getClaimedVarUseSites() const override { const auto *ArraySubst = cast(Node->getSubExpr()); const auto *DRE = - cast(ArraySubst->getBase()->IgnoreImpCasts()); + cast(ArraySubst->getBase()->IgnoreParenImpCasts()); return {DRE}; } }; diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1b1da6a1356f2..113483db5729b 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -86,7 +86,7 @@ static const CudaArchToStringMap arch_names[] = { // clang-format off {CudaArch::UNUSED, "", ""}, SM2(20, "compute_20"), SM2(21, "compute_20"), // Fermi - SM(30), SM(32), SM(35), SM(37), // Kepler + SM(30), {CudaArch::SM_32_, "sm_32", "compute_32"}, SM(35), SM(37), // Kepler SM(50), SM(52), SM(53), // Maxwell SM(60), SM(61), SM(62), // Pascal SM(70), SM(72), // Volta @@ -186,7 +186,7 @@ CudaVersion MinVersionForCudaArch(CudaArch A) { case CudaArch::SM_20: case CudaArch::SM_21: case CudaArch::SM_30: - case CudaArch::SM_32: + case CudaArch::SM_32_: case CudaArch::SM_35: case CudaArch::SM_37: case CudaArch::SM_50: @@ -231,7 +231,7 @@ CudaVersion MaxVersionForCudaArch(CudaArch A) { case CudaArch::SM_21: return CudaVersion::CUDA_80; case CudaArch::SM_30: - case CudaArch::SM_32: + case CudaArch::SM_32_: return CudaVersion::CUDA_102; case CudaArch::SM_35: case CudaArch::SM_37: diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index b47c399fef604..8ad9e6e5f5891 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -239,7 +239,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, return "210"; case CudaArch::SM_30: return 
"300"; - case CudaArch::SM_32: + case CudaArch::SM_32_: return "320"; case CudaArch::SM_35: return "350"; diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index f3d705e1551fe..a7ce9dda34bdd 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -353,7 +353,8 @@ bool RISCVTargetInfo::handleTargetFeatures(std::vector &Features, if (ISAInfo->hasExtension("zfh") || ISAInfo->hasExtension("zhinx")) HasLegalHalfType = true; - FastUnalignedAccess = llvm::is_contained(Features, "+fast-unaligned-access"); + FastUnalignedAccess = llvm::is_contained(Features, "+unaligned-scalar-mem") && + llvm::is_contained(Features, "+unaligned-vector-mem"); if (llvm::is_contained(Features, "+experimental")) HasExperimental = true; diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index e25991e3dfe82..44265445ff004 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -259,7 +259,7 @@ class LLVM_LIBRARY_VISIBILITY SPIR32TargetInfo : public SPIRTargetInfo { SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -276,7 +276,7 @@ class LLVM_LIBRARY_VISIBILITY SPIR64TargetInfo : public SPIRTargetInfo { SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -315,7 +315,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRVTargetInfo : public BaseSPIRVTargetInfo { // SPIR-V IDs are represented with a single 32-bit word. SizeType = TargetInfo::UnsignedInt; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -336,7 +336,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV32TargetInfo : public BaseSPIRVTargetInfo { SizeType = TargetInfo::UnsignedInt; PtrDiffType = IntPtrType = TargetInfo::SignedInt; resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, @@ -357,7 +357,7 @@ class LLVM_LIBRARY_VISIBILITY SPIRV64TargetInfo : public BaseSPIRVTargetInfo { SizeType = TargetInfo::UnsignedLong; PtrDiffType = IntPtrType = TargetInfo::SignedLong; resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"); + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"); } void getTargetDefines(const LangOptions &Opts, diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 6cc00b85664f4..22c3f8642ad8e 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -104,6 +104,21 @@ static cl::opt ClSanitizeOnOptimizerEarlyEP( "sanitizer-early-opt-ep", cl::Optional, cl::desc("Insert sanitizers on OptimizerEarlyEP.")); +// Experiment to mark cold functions as optsize/minsize/optnone. +// TODO: remove once this is exposed as a proper driver flag. 
+static cl::opt ClPGOColdFuncAttr( + "pgo-cold-func-opt", cl::init(PGOOptions::ColdFuncOpt::Default), cl::Hidden, + cl::desc( + "Function attribute to apply to cold functions as determined by PGO"), + cl::values(clEnumValN(PGOOptions::ColdFuncOpt::Default, "default", + "Default (no attribute)"), + clEnumValN(PGOOptions::ColdFuncOpt::OptSize, "optsize", + "Mark cold functions with optsize."), + clEnumValN(PGOOptions::ColdFuncOpt::MinSize, "minsize", + "Mark cold functions with minsize."), + clEnumValN(PGOOptions::ColdFuncOpt::OptNone, "optnone", + "Mark cold functions with optnone."))); + extern cl::opt ProfileCorrelate; // Re-link builtin bitcodes after optimization @@ -768,42 +783,41 @@ void EmitAssemblyHelper::RunOptimizationPipeline( CodeGenOpts.InstrProfileOutput.empty() ? getDefaultProfileGenName() : CodeGenOpts.InstrProfileOutput, "", "", CodeGenOpts.MemoryProfileUsePath, nullptr, PGOOptions::IRInstr, - PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, /*PseudoProbeForProfiling=*/false, CodeGenOpts.AtomicProfileUpdate); else if (CodeGenOpts.hasProfileIRUse()) { // -fprofile-use. auto CSAction = CodeGenOpts.hasProfileCSIRUse() ? PGOOptions::CSIRUse : PGOOptions::NoCSAction; - PGOOpt = PGOOptions( - CodeGenOpts.ProfileInstrumentUsePath, "", - CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS, - PGOOptions::IRUse, CSAction, PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions(CodeGenOpts.ProfileInstrumentUsePath, "", + CodeGenOpts.ProfileRemappingFile, + CodeGenOpts.MemoryProfileUsePath, VFS, + PGOOptions::IRUse, CSAction, ClPGOColdFuncAttr, + CodeGenOpts.DebugInfoForProfiling); } else if (!CodeGenOpts.SampleProfileFile.empty()) // -fprofile-sample-use PGOOpt = PGOOptions( CodeGenOpts.SampleProfileFile, "", CodeGenOpts.ProfileRemappingFile, CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::SampleUse, - PGOOptions::NoCSAction, PGOOptions::ColdFuncOpt::Default, + PGOOptions::NoCSAction, ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, CodeGenOpts.PseudoProbeForProfiling); else if (!CodeGenOpts.MemoryProfileUsePath.empty()) // -fmemory-profile-use (without any of the above options) PGOOpt = PGOOptions("", "", "", CodeGenOpts.MemoryProfileUsePath, VFS, PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); else if (CodeGenOpts.PseudoProbeForProfiling) // -fpseudo-probe-for-profiling - PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, - PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling, true); + PGOOpt = + PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, + PGOOptions::NoAction, PGOOptions::NoCSAction, + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling, true); else if (CodeGenOpts.DebugInfoForProfiling) // -fdebug-info-for-profiling PGOOpt = PGOOptions("", "", "", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, PGOOptions::NoCSAction, - PGOOptions::ColdFuncOpt::Default, true); + ClPGOColdFuncAttr, true); // Check to see if we want to generate a CS profile. if (CodeGenOpts.hasProfileCSIRInstr()) { @@ -820,14 +834,13 @@ void EmitAssemblyHelper::RunOptimizationPipeline( : CodeGenOpts.InstrProfileOutput; PGOOpt->CSAction = PGOOptions::CSIRInstr; } else - PGOOpt = - PGOOptions("", - CodeGenOpts.InstrProfileOutput.empty() - ? 
getDefaultProfileGenName() - : CodeGenOpts.InstrProfileOutput, - "", /*MemoryProfile=*/"", nullptr, PGOOptions::NoAction, - PGOOptions::CSIRInstr, PGOOptions::ColdFuncOpt::Default, - CodeGenOpts.DebugInfoForProfiling); + PGOOpt = PGOOptions("", + CodeGenOpts.InstrProfileOutput.empty() + ? getDefaultProfileGenName() + : CodeGenOpts.InstrProfileOutput, + "", /*MemoryProfile=*/"", nullptr, + PGOOptions::NoAction, PGOOptions::CSIRInstr, + ClPGOColdFuncAttr, CodeGenOpts.DebugInfoForProfiling); } if (TM) TM->setPGOOption(PGOOpt); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9f95697f284c4..a05874e63c73c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -3436,6 +3436,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, Builder.CreateAssumption(ConstantInt::getTrue(getLLVMContext()), {OBD}); return RValue::get(nullptr); } + case Builtin::BI__builtin_allow_runtime_check: { + StringRef Kind = + cast(E->getArg(0)->IgnoreParenCasts())->getString(); + LLVMContext &Ctx = CGM.getLLVMContext(); + llvm::Value *Allow = Builder.CreateCall( + CGM.getIntrinsic(llvm::Intrinsic::allow_runtime_check), + llvm::MetadataAsValue::get(Ctx, llvm::MDString::get(Ctx, Kind))); + return RValue::get(Allow); + } case Builtin::BI__arithmetic_fence: { // Create the builtin call if FastMath is selected, and the target // supports the builtin, otherwise just return the argument. diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 7a0bc6fa77b88..3f5463a9a70e9 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -4124,8 +4124,7 @@ static bool isProvablyNull(llvm::Value *addr) { } static bool isProvablyNonNull(Address Addr, CodeGenFunction &CGF) { - return llvm::isKnownNonZero(Addr.getBasePointer(), /*Depth=*/0, - CGF.CGM.getDataLayout()); + return llvm::isKnownNonZero(Addr.getBasePointer(), CGF.CGM.getDataLayout()); } /// Emit the actual writing-back of a writeback. diff --git a/clang/lib/CodeGen/CGCleanup.cpp b/clang/lib/CodeGen/CGCleanup.cpp index 5bf48bc22a549..e6f8e6873004f 100644 --- a/clang/lib/CodeGen/CGCleanup.cpp +++ b/clang/lib/CodeGen/CGCleanup.cpp @@ -667,8 +667,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { // - whether there's a fallthrough llvm::BasicBlock *FallthroughSource = Builder.GetInsertBlock(); - bool HasFallthrough = - FallthroughSource != nullptr && (IsActive || HasExistingBranches); + bool HasFallthrough = (FallthroughSource != nullptr && IsActive); // Branch-through fall-throughs leave the insertion point set to the // end of the last cleanup, which points to the current scope. The @@ -693,11 +692,7 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { // If we have a prebranched fallthrough into an inactive normal // cleanup, rewrite it so that it leads to the appropriate place. - if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && - !RequiresNormalCleanup) { - // FIXME: Come up with a program which would need forwarding prebranched - // fallthrough and add tests. Otherwise delete this and assert against it. 
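The `CGBuiltin.cpp` hunk above lowers the new builtin to the `llvm.allow.runtime.check` intrinsic, attaching the kind string as metadata. A sketch of the intended usage shape in user code (an assumption drawn from the builtin's semantics; `log_failure` is a hypothetical helper):

```cpp
extern void log_failure(const char *);

void check_invariant(bool ok) {
  // The optimizer may fold the builtin to false (dropping the check) or
  // keep it as a runtime decision, e.g. depending on hot/cold placement.
  if (__builtin_allow_runtime_check("invariant") && !ok)
    log_failure("invariant violated");
}
```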
- assert(!IsActive); + if (Scope.isNormalCleanup() && HasPrebranchedFallthrough && !IsActive) { llvm::BasicBlock *prebranchDest; // If the prebranch is semantically branching through the next @@ -770,7 +765,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { EmitSehCppScopeEnd(); } destroyOptimisticNormalEntry(*this, Scope); - Scope.MarkEmitted(); EHStack.popCleanup(); } else { // If we have a fallthrough and no other need for the cleanup, @@ -787,7 +781,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { } destroyOptimisticNormalEntry(*this, Scope); - Scope.MarkEmitted(); EHStack.popCleanup(); EmitCleanup(*this, Fn, cleanupFlags, NormalActiveFlag); @@ -923,7 +916,6 @@ void CodeGenFunction::PopCleanupBlock(bool FallthroughIsBranchThrough) { } // IV. Pop the cleanup and emit it. - Scope.MarkEmitted(); EHStack.popCleanup(); assert(EHStack.hasNormalCleanups() == HasEnclosingCleanups); diff --git a/clang/lib/CodeGen/CGCleanup.h b/clang/lib/CodeGen/CGCleanup.h index c73c97146abc4..03e4a29d7b3db 100644 --- a/clang/lib/CodeGen/CGCleanup.h +++ b/clang/lib/CodeGen/CGCleanup.h @@ -16,11 +16,8 @@ #include "EHScopeStack.h" #include "Address.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Instruction.h" namespace llvm { class BasicBlock; @@ -269,51 +266,6 @@ class alignas(8) EHCleanupScope : public EHScope { }; mutable struct ExtInfo *ExtInfo; - /// Erases auxillary allocas and their usages for an unused cleanup. - /// Cleanups should mark these allocas as 'used' if the cleanup is - /// emitted, otherwise these instructions would be erased. - struct AuxillaryAllocas { - SmallVector AuxAllocas; - bool used = false; - - // Records a potentially unused instruction to be erased later. - void Add(llvm::AllocaInst *Alloca) { AuxAllocas.push_back(Alloca); } - - // Mark all recorded instructions as used. These will not be erased later. - void MarkUsed() { - used = true; - AuxAllocas.clear(); - } - - ~AuxillaryAllocas() { - if (used) - return; - llvm::SetVector Uses; - for (auto *Inst : llvm::reverse(AuxAllocas)) - CollectUses(Inst, Uses); - // Delete uses in the reverse order of insertion. - for (auto *I : llvm::reverse(Uses)) - I->eraseFromParent(); - } - - private: - void CollectUses(llvm::Instruction *I, - llvm::SetVector &Uses) { - if (!I || !Uses.insert(I)) - return; - for (auto *User : I->users()) - CollectUses(cast(User), Uses); - } - }; - mutable struct AuxillaryAllocas *AuxAllocas; - - AuxillaryAllocas &getAuxillaryAllocas() { - if (!AuxAllocas) { - AuxAllocas = new struct AuxillaryAllocas(); - } - return *AuxAllocas; - } - /// The number of fixups required by enclosing scopes (not including /// this one). If this is the top cleanup scope, all the fixups /// from this index onwards belong to this scope. 
@@ -346,7 +298,7 @@ class alignas(8) EHCleanupScope : public EHScope { EHScopeStack::stable_iterator enclosingEH) : EHScope(EHScope::Cleanup, enclosingEH), EnclosingNormal(enclosingNormal), NormalBlock(nullptr), - ActiveFlag(Address::invalid()), ExtInfo(nullptr), AuxAllocas(nullptr), + ActiveFlag(Address::invalid()), ExtInfo(nullptr), FixupDepth(fixupDepth) { CleanupBits.IsNormalCleanup = isNormal; CleanupBits.IsEHCleanup = isEH; @@ -360,15 +312,8 @@ class alignas(8) EHCleanupScope : public EHScope { } void Destroy() { - if (AuxAllocas) - delete AuxAllocas; delete ExtInfo; } - void AddAuxAllocas(llvm::SmallVector Allocas) { - for (auto *Alloca : Allocas) - getAuxillaryAllocas().Add(Alloca); - } - void MarkEmitted() { getAuxillaryAllocas().MarkUsed(); } // Objects of EHCleanupScope are not destructed. Use Destroy(). ~EHCleanupScope() = delete; diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 3f05ebb561da5..ce6d6d8956076 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -19,7 +19,6 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "ConstantEmitter.h" -#include "EHScopeStack.h" #include "PatternInit.h" #include "TargetInfo.h" #include "clang/AST/ASTContext.h" @@ -2202,27 +2201,6 @@ void CodeGenFunction::pushDestroy(CleanupKind cleanupKind, Address addr, destroyer, useEHCleanupForArray); } -// Pushes a destroy and defers its deactivation until its -// CleanupDeactivationScope is exited. -void CodeGenFunction::pushDestroyAndDeferDeactivation( - QualType::DestructionKind dtorKind, Address addr, QualType type) { - assert(dtorKind && "cannot push destructor for trivial type"); - - CleanupKind cleanupKind = getCleanupKind(dtorKind); - pushDestroyAndDeferDeactivation( - cleanupKind, addr, type, getDestroyer(dtorKind), cleanupKind & EHCleanup); -} - -void CodeGenFunction::pushDestroyAndDeferDeactivation( - CleanupKind cleanupKind, Address addr, QualType type, Destroyer *destroyer, - bool useEHCleanupForArray) { - llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); - pushDestroy(cleanupKind, addr, type, destroyer, useEHCleanupForArray); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); -} - void CodeGenFunction::pushStackRestore(CleanupKind Kind, Address SPMem) { EHStack.pushCleanup(Kind, SPMem); } @@ -2239,19 +2217,16 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind, // If we're not in a conditional branch, we don't need to bother generating a // conditional cleanup. if (!isInConditionalBranch()) { + // Push an EH-only cleanup for the object now. // FIXME: When popping normal cleanups, we need to keep this EH cleanup // around in case a temporary's destructor throws an exception. + if (cleanupKind & EHCleanup) + EHStack.pushCleanup( + static_cast(cleanupKind & ~NormalCleanup), addr, type, + destroyer, useEHCleanupForArray); - // Add the cleanup to the EHStack. After the full-expr, this would be - // deactivated before being popped from the stack. - pushDestroyAndDeferDeactivation(cleanupKind, addr, type, destroyer, - useEHCleanupForArray); - - // Since this is lifetime-extended, push it once again to the EHStack after - // the full expression. 
return pushCleanupAfterFullExprWithActiveFlag( - cleanupKind, Address::invalid(), addr, type, destroyer, - useEHCleanupForArray); + cleanupKind, Address::invalid(), addr, type, destroyer, useEHCleanupForArray); } // Otherwise, we should only destroy the object if it's been initialized. @@ -2266,12 +2241,13 @@ void CodeGenFunction::pushLifetimeExtendedDestroy(CleanupKind cleanupKind, Address ActiveFlag = createCleanupActiveFlag(); SavedType SavedAddr = saveValueInCond(addr); - pushCleanupAndDeferDeactivation( - cleanupKind, SavedAddr, type, destroyer, useEHCleanupForArray); - initFullExprCleanupWithFlag(ActiveFlag); + if (cleanupKind & EHCleanup) { + EHStack.pushCleanup( + static_cast(cleanupKind & ~NormalCleanup), SavedAddr, type, + destroyer, useEHCleanupForArray); + initFullExprCleanupWithFlag(ActiveFlag); + } - // Since this is lifetime-extended, push it once again to the EHStack after - // the full expression. pushCleanupAfterFullExprWithActiveFlag( cleanupKind, ActiveFlag, SavedAddr, type, destroyer, useEHCleanupForArray); @@ -2466,9 +2442,9 @@ namespace { }; } // end anonymous namespace -/// pushIrregularPartialArrayCleanup - Push a NormalAndEHCleanup to -/// destroy already-constructed elements of the given array. The cleanup may be -/// popped with DeactivateCleanupBlock or PopCleanupBlock. +/// pushIrregularPartialArrayCleanup - Push an EH cleanup to destroy +/// already-constructed elements of the given array. The cleanup +/// may be popped with DeactivateCleanupBlock or PopCleanupBlock. /// /// \param elementType - the immediate element type of the array; /// possibly still an array type @@ -2477,9 +2453,10 @@ void CodeGenFunction::pushIrregularPartialArrayCleanup(llvm::Value *arrayBegin, QualType elementType, CharUnits elementAlign, Destroyer *destroyer) { - pushFullExprCleanup( - NormalAndEHCleanup, arrayBegin, arrayEndPointer, elementType, - elementAlign, destroyer); + pushFullExprCleanup(EHCleanup, + arrayBegin, arrayEndPointer, + elementType, elementAlign, + destroyer); } /// pushRegularPartialArrayCleanup - Push an EH cleanup to destroy diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index c85a339f5e3f8..cf696a1c9f560 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -115,16 +115,10 @@ RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, CharUnits Align, llvm::AllocaInst *CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, const Twine &Name, llvm::Value *ArraySize) { - llvm::AllocaInst *Alloca; if (ArraySize) - Alloca = Builder.CreateAlloca(Ty, ArraySize, Name); - else - Alloca = new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(), - ArraySize, Name, AllocaInsertPt); - if (Allocas) { - Allocas->Add(Alloca); - } - return Alloca; + return Builder.CreateAlloca(Ty, ArraySize, Name); + return new llvm::AllocaInst(Ty, CGM.getDataLayout().getAllocaAddrSpace(), + ArraySize, Name, AllocaInsertPt); } /// CreateDefaultAlignTempAlloca - This creates an alloca with the diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index 560a9e2c5ead5..1b9287ea23934 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -15,7 +15,6 @@ #include "CodeGenFunction.h" #include "CodeGenModule.h" #include "ConstantEmitter.h" -#include "EHScopeStack.h" #include "TargetInfo.h" #include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" @@ -25,7 +24,6 @@ #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" -#include 
"llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" using namespace clang; @@ -560,27 +558,24 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType, // For that, we'll need an EH cleanup. QualType::DestructionKind dtorKind = elementType.isDestructedType(); Address endOfInit = Address::invalid(); - CodeGenFunction::CleanupDeactivationScope deactivation(CGF); - - if (dtorKind) { - CodeGenFunction::AllocaTrackerRAII allocaTracker(CGF); + EHScopeStack::stable_iterator cleanup; + llvm::Instruction *cleanupDominator = nullptr; + if (CGF.needsEHCleanup(dtorKind)) { // In principle we could tell the cleanup where we are more // directly, but the control flow can get so varied here that it // would actually be quite complex. Therefore we go through an // alloca. - llvm::Instruction *dominatingIP = - Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(CGF.Int8PtrTy)); endOfInit = CGF.CreateTempAlloca(begin->getType(), CGF.getPointerAlign(), "arrayinit.endOfInit"); - Builder.CreateStore(begin, endOfInit); + cleanupDominator = Builder.CreateStore(begin, endOfInit); CGF.pushIrregularPartialArrayCleanup(begin, endOfInit, elementType, elementAlign, CGF.getDestroyer(dtorKind)); - cast(*CGF.EHStack.find(CGF.EHStack.stable_begin())) - .AddAuxAllocas(allocaTracker.Take()); + cleanup = CGF.EHStack.stable_begin(); - CGF.DeferredDeactivationCleanupStack.push_back( - {CGF.EHStack.stable_begin(), dominatingIP}); + // Otherwise, remember that we didn't need a cleanup. + } else { + dtorKind = QualType::DK_none; } llvm::Value *one = llvm::ConstantInt::get(CGF.SizeTy, 1); @@ -676,6 +671,9 @@ void AggExprEmitter::EmitArrayInit(Address DestPtr, llvm::ArrayType *AType, CGF.EmitBlock(endBB); } + + // Leave the partial-array cleanup if we entered one. + if (dtorKind) CGF.DeactivateCleanupBlock(cleanup, cleanupDominator); } //===----------------------------------------------------------------------===// @@ -1376,8 +1374,9 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) { LValue SlotLV = CGF.MakeAddrLValue(Slot.getAddress(), E->getType()); // We'll need to enter cleanup scopes in case any of the element - // initializers throws an exception or contains branch out of the expressions. - CodeGenFunction::CleanupDeactivationScope scope(CGF); + // initializers throws an exception. + SmallVector Cleanups; + llvm::Instruction *CleanupDominator = nullptr; CXXRecordDecl::field_iterator CurField = E->getLambdaClass()->field_begin(); for (LambdaExpr::const_capture_init_iterator i = E->capture_init_begin(), @@ -1396,12 +1395,28 @@ AggExprEmitter::VisitLambdaExpr(LambdaExpr *E) { if (QualType::DestructionKind DtorKind = CurField->getType().isDestructedType()) { assert(LV.isSimple()); - if (DtorKind) - CGF.pushDestroyAndDeferDeactivation( - NormalAndEHCleanup, LV.getAddress(CGF), CurField->getType(), - CGF.getDestroyer(DtorKind), false); + if (CGF.needsEHCleanup(DtorKind)) { + if (!CleanupDominator) + CleanupDominator = CGF.Builder.CreateAlignedLoad( + CGF.Int8Ty, + llvm::Constant::getNullValue(CGF.Int8PtrTy), + CharUnits::One()); // placeholder + + CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), CurField->getType(), + CGF.getDestroyer(DtorKind), false); + Cleanups.push_back(CGF.EHStack.stable_begin()); + } } } + + // Deactivate all the partial cleanups in reverse order, which + // generally means popping them. + for (unsigned i = Cleanups.size(); i != 0; --i) + CGF.DeactivateCleanupBlock(Cleanups[i-1], CleanupDominator); + + // Destroy the placeholder if we made one. 
+ if (CleanupDominator) + CleanupDominator->eraseFromParent(); } void AggExprEmitter::VisitExprWithCleanups(ExprWithCleanups *E) { @@ -1690,7 +1705,14 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( // We'll need to enter cleanup scopes in case any of the element // initializers throws an exception. SmallVector cleanups; - CodeGenFunction::CleanupDeactivationScope DeactivateCleanups(CGF); + llvm::Instruction *cleanupDominator = nullptr; + auto addCleanup = [&](const EHScopeStack::stable_iterator &cleanup) { + cleanups.push_back(cleanup); + if (!cleanupDominator) // create placeholder once needed + cleanupDominator = CGF.Builder.CreateAlignedLoad( + CGF.Int8Ty, llvm::Constant::getNullValue(CGF.Int8PtrTy), + CharUnits::One()); + }; unsigned curInitIndex = 0; @@ -1713,8 +1735,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( CGF.EmitAggExpr(InitExprs[curInitIndex++], AggSlot); if (QualType::DestructionKind dtorKind = - Base.getType().isDestructedType()) - CGF.pushDestroyAndDeferDeactivation(dtorKind, V, Base.getType()); + Base.getType().isDestructedType()) { + CGF.pushDestroy(dtorKind, V, Base.getType()); + addCleanup(CGF.EHStack.stable_begin()); + } } } @@ -1789,10 +1813,10 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( if (QualType::DestructionKind dtorKind = field->getType().isDestructedType()) { assert(LV.isSimple()); - if (dtorKind) { - CGF.pushDestroyAndDeferDeactivation( - NormalAndEHCleanup, LV.getAddress(CGF), field->getType(), - CGF.getDestroyer(dtorKind), false); + if (CGF.needsEHCleanup(dtorKind)) { + CGF.pushDestroy(EHCleanup, LV.getAddress(CGF), field->getType(), + CGF.getDestroyer(dtorKind), false); + addCleanup(CGF.EHStack.stable_begin()); pushedCleanup = true; } } @@ -1805,6 +1829,17 @@ void AggExprEmitter::VisitCXXParenListOrInitListExpr( if (GEP->use_empty()) GEP->eraseFromParent(); } + + // Deactivate all the partial cleanups in reverse order, which + // generally means popping them. + assert((cleanupDominator || cleanups.empty()) && + "Missing cleanupDominator before deactivating cleanup blocks"); + for (unsigned i = cleanups.size(); i != 0; --i) + CGF.DeactivateCleanupBlock(cleanups[i-1], cleanupDominator); + + // Destroy the placeholder if we made one. + if (cleanupDominator) + cleanupDominator->eraseFromParent(); } void AggExprEmitter::VisitArrayInitLoopExpr(const ArrayInitLoopExpr *E, diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index a88b29b326bb9..a4fb673284cec 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1008,8 +1008,8 @@ void CodeGenFunction::EmitNewArrayInitializer( const Expr *Init = E->getInitializer(); Address EndOfInit = Address::invalid(); QualType::DestructionKind DtorKind = ElementType.isDestructedType(); - CleanupDeactivationScope deactivation(*this); - bool pushedCleanup = false; + EHScopeStack::stable_iterator Cleanup; + llvm::Instruction *CleanupDominator = nullptr; CharUnits ElementSize = getContext().getTypeSizeInChars(ElementType); CharUnits ElementAlign = @@ -1105,24 +1105,19 @@ void CodeGenFunction::EmitNewArrayInitializer( } // Enter a partial-destruction Cleanup if necessary. - if (DtorKind) { - AllocaTrackerRAII AllocaTracker(*this); + if (needsEHCleanup(DtorKind)) { // In principle we could tell the Cleanup where we are more // directly, but the control flow can get so varied here that it // would actually be quite complex. Therefore we go through an // alloca. 
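The `EndOfInit` alloca described in the comment above implements the classic partial-construction rollback: keep a pointer just past the last fully constructed element, so an exception destroys exactly the constructed prefix. The same idea in plain C++, without the CodeGen machinery:

```cpp
#include <new>

// Construct the elements of [Begin, End) in place; on exception, destroy
// the already-constructed prefix [Begin, Cur) in reverse and rethrow.
template <class T> void constructArray(T *Begin, T *End) {
  T *Cur = Begin; // plays the role of the end-of-init marker
  try {
    for (; Cur != End; ++Cur)
      new (Cur) T();
  } catch (...) {
    while (Cur != Begin)
      (--Cur)->~T();
    throw;
  }
}
```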
- llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy)); EndOfInit = CreateTempAlloca(BeginPtr.getType(), getPointerAlign(), "array.init.end"); + CleanupDominator = + Builder.CreateStore(BeginPtr.emitRawPointer(*this), EndOfInit); pushIrregularPartialArrayCleanup(BeginPtr.emitRawPointer(*this), EndOfInit, ElementType, ElementAlign, getDestroyer(DtorKind)); - cast(*EHStack.find(EHStack.stable_begin())) - .AddAuxAllocas(AllocaTracker.Take()); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); - pushedCleanup = true; + Cleanup = EHStack.stable_begin(); } CharUnits StartAlign = CurPtr.getAlignment(); @@ -1169,6 +1164,9 @@ void CodeGenFunction::EmitNewArrayInitializer( // initialization. llvm::ConstantInt *ConstNum = dyn_cast(NumElements); if (ConstNum && ConstNum->getZExtValue() <= InitListElements) { + // If there was a Cleanup, deactivate it. + if (CleanupDominator) + DeactivateCleanupBlock(Cleanup, CleanupDominator); return; } @@ -1283,14 +1281,13 @@ void CodeGenFunction::EmitNewArrayInitializer( Builder.CreateStore(CurPtr.emitRawPointer(*this), EndOfInit); // Enter a partial-destruction Cleanup if necessary. - if (!pushedCleanup && needsEHCleanup(DtorKind)) { - llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::ConstantInt::getNullValue(Int8PtrTy)); - pushRegularPartialArrayCleanup(BeginPtr.emitRawPointer(*this), - CurPtr.emitRawPointer(*this), ElementType, + if (!CleanupDominator && needsEHCleanup(DtorKind)) { + llvm::Value *BeginPtrRaw = BeginPtr.emitRawPointer(*this); + llvm::Value *CurPtrRaw = CurPtr.emitRawPointer(*this); + pushRegularPartialArrayCleanup(BeginPtrRaw, CurPtrRaw, ElementType, ElementAlign, getDestroyer(DtorKind)); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); + Cleanup = EHStack.stable_begin(); + CleanupDominator = Builder.CreateUnreachable(); } // Emit the initializer into this element. @@ -1298,7 +1295,10 @@ void CodeGenFunction::EmitNewArrayInitializer( AggValueSlot::DoesNotOverlap); // Leave the Cleanup if we entered one. - deactivation.ForceDeactivate(); + if (CleanupDominator) { + DeactivateCleanupBlock(Cleanup, CleanupDominator); + CleanupDominator->eraseFromParent(); + } // Advance to the next element by adjusting the pointer type as necessary. llvm::Value *NextPtr = Builder.CreateConstInBoundsGEP1_32( diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 59ba03c6b8625..eb716520e5ff5 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3466,7 +3466,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::SM_20: case CudaArch::SM_21: case CudaArch::SM_30: - case CudaArch::SM_32: + case CudaArch::SM_32_: case CudaArch::SM_35: case CudaArch::SM_37: case CudaArch::SM_50: diff --git a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp index 634a55fec5182..868b1ab98e048 100644 --- a/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp +++ b/clang/lib/CodeGen/CGRecordLayoutBuilder.cpp @@ -41,10 +41,11 @@ namespace { /// contains enough information to determine where the runs break. Microsoft /// and Itanium follow different rules and use different codepaths. /// * It is desired that, when possible, bitfields use the appropriate iN type -/// when lowered to llvm types. For example unsigned x : 24 gets lowered to +/// when lowered to llvm types. 
For example unsigned x : 24 gets lowered to /// i24. This isn't always possible because i24 has storage size of 32 bit -/// and if it is possible to use that extra byte of padding we must use -/// [i8 x 3] instead of i24. The function clipTailPadding does this. +/// and if it is possible to use that extra byte of padding we must use [i8 x +/// 3] instead of i24. This is computed when accumulating bitfields in +/// accumulateBitfields. /// C++ examples that require clipping: /// struct { int a : 24; char b; }; // a must be clipped, b goes at offset 3 /// struct A { int a : 24; ~A(); }; // a must be clipped because: @@ -62,11 +63,7 @@ namespace { /// that the tail padding is not used in the complete class.) However, /// because LLVM reads from the complete type it can generate incorrect code /// if we do not clip the tail padding off of the bitfield in the complete -/// layout. This introduces a somewhat awkward extra unnecessary clip stage. -/// The location of the clip is stored internally as a sentinel of type -/// SCISSOR. If LLVM were updated to read base types (which it probably -/// should because locations of things such as VBases are bogus in the llvm -/// type anyway) then we could eliminate the SCISSOR. +/// layout. /// * Itanium allows nearly empty primary virtual bases. These bases don't get /// get their own storage because they're laid out as part of another base /// or at the beginning of the structure. Determining if a VBase actually @@ -200,9 +197,7 @@ struct CGRecordLowering { const CXXRecordDecl *Query) const; void calculateZeroInit(); CharUnits calculateTailClippingOffset(bool isNonVirtualBaseType) const; - /// Lowers bitfield storage types to I8 arrays for bitfields with tail - /// padding that is or can potentially be used. - void clipTailPadding(); + void checkBitfieldClipping() const; /// Determines if we need a packed llvm struct. void determinePacked(bool NVBaseType); /// Inserts padding everywhere it's needed. @@ -305,7 +300,7 @@ void CGRecordLowering::lower(bool NVBaseType) { } llvm::stable_sort(Members); Members.push_back(StorageInfo(Size, getIntNType(8))); - clipTailPadding(); + checkBitfieldClipping(); determinePacked(NVBaseType); insertPadding(); Members.pop_back(); @@ -531,6 +526,7 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // available padding characters. RecordDecl::field_iterator BestEnd = Begin; CharUnits BestEndOffset; + bool BestClipped; // Whether the representation must be in a byte array. for (;;) { // AtAlignedBoundary is true iff Field is the (potential) start of a new @@ -593,10 +589,9 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // this is the best seen so far. BestEnd = Field; BestEndOffset = BeginOffset + AccessSize; - if (Types.getCodeGenOpts().FineGrainedBitfieldAccesses) - // Fine-grained access, so no merging of spans. - InstallBest = true; - else if (!BitSizeSinceBegin) + // Assume clipped until proven not below. + BestClipped = true; + if (!BitSizeSinceBegin) // A zero-sized initial span -- this will install nothing and reset // for another. InstallBest = true; @@ -624,6 +619,12 @@ CGRecordLowering::accumulateBitFields(bool isNonVirtualBaseType, // The access unit is not at a naturally aligned offset within the // structure. InstallBest = true; + + if (InstallBest && BestEnd == Field) + // We're installing the first span, whose clipping was presumed + // above. Compute it correctly. 
+ if (getSize(Type) == AccessSize) + BestClipped = false; } if (!InstallBest) { @@ -656,11 +657,15 @@ // access unit. BestEndOffset = BeginOffset + TypeSize; BestEnd = Field; + BestClipped = false; } if (Barrier) // The next field is a barrier that we cannot merge across. InstallBest = true; + else if (Types.getCodeGenOpts().FineGrainedBitfieldAccesses) + // Fine-grained access, so no merging of spans. + InstallBest = true; else // Otherwise, we're not installing. Update the bit size // of the current span to go all the way to LimitOffset, which is @@ -679,7 +684,17 @@ // Add the storage member for the access unit to the record. The // bitfields get the offset of their storage but come afterward and // remain there after a stable sort. - llvm::Type *Type = getIntNType(Context.toBits(AccessSize)); + llvm::Type *Type; + if (BestClipped) { + assert(getSize(getIntNType(Context.toBits(AccessSize))) > + AccessSize && + "Clipped access must need clipping"); + Type = getByteArrayType(AccessSize); + } else { + Type = getIntNType(Context.toBits(AccessSize)); + assert(getSize(Type) == AccessSize && + "Unclipped access must not need clipping"); + } Members.push_back(StorageInfo(BeginOffset, Type)); for (; Begin != BestEnd; ++Begin) if (!Begin->isZeroLengthBitField(Context)) @@ -934,32 +949,21 @@ void CGRecordLowering::calculateZeroInit() { } } -void CGRecordLowering::clipTailPadding() { - std::vector<MemberInfo>::iterator Prior = Members.begin(); - CharUnits Tail = getSize(Prior->Data); - for (std::vector<MemberInfo>::iterator Member = Prior + 1, - MemberEnd = Members.end(); - Member != MemberEnd; ++Member) { +// Verify accumulateBitfields computed the correct storage representations. +void CGRecordLowering::checkBitfieldClipping() const { +#ifndef NDEBUG + auto Tail = CharUnits::Zero(); + for (const auto &M : Members) { // Only members with data and the scissor can cut into tail padding.
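A minimal illustration of the clipped/unclipped distinction asserted above, using hypothetical record types P and Q (not from this patch):

    struct P { int a : 24; char b; };  // i24 has a 32-bit storage size, which
                                       // would overlap b, so the access unit
                                       // must be lowered as the byte array
                                       // [3 x i8] (clipped).
    struct Q { int a : 16; short b; }; // i16's storage size is exactly two
                                       // bytes, so the access unit can stay an
                                       // integer type (unclipped).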
- if (!Member->Data && Member->Kind != MemberInfo::Scissor) + if (!M.Data && M.Kind != MemberInfo::Scissor) continue; - if (Member->Offset < Tail) { - assert(Prior->Kind == MemberInfo::Field && - "Only storage fields have tail padding!"); - if (!Prior->FD || Prior->FD->isBitField()) - Prior->Data = getByteArrayType(bitsToCharUnits(llvm::alignTo( - cast(Prior->Data)->getIntegerBitWidth(), 8))); - else { - assert(Prior->FD->hasAttr() && - "should not have reused this field's tail padding"); - Prior->Data = getByteArrayType( - Context.getTypeInfoDataSizeInChars(Prior->FD->getType()).Width); - } - } - if (Member->Data) - Prior = Member; - Tail = Prior->Offset + getSize(Prior->Data); + + assert(M.Offset >= Tail && "Bitfield access unit is not clipped"); + Tail = M.Offset; + if (M.Data) + Tail += getSize(M.Data); } +#endif } void CGRecordLowering::determinePacked(bool NVBaseType) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 87766a758311d..86a6ddd80cc11 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -91,8 +91,6 @@ CodeGenFunction::CodeGenFunction(CodeGenModule &cgm, bool suppressNewContext) CodeGenFunction::~CodeGenFunction() { assert(LifetimeExtendedCleanupStack.empty() && "failed to emit a cleanup"); - assert(DeferredDeactivationCleanupStack.empty() && - "missed to deactivate a cleanup"); if (getLangOpts().OpenMP && CurFn) CGM.getOpenMPRuntime().functionFinished(*this); @@ -348,10 +346,6 @@ static void EmitIfUsed(CodeGenFunction &CGF, llvm::BasicBlock *BB) { void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { assert(BreakContinueStack.empty() && "mismatched push/pop in break/continue stack!"); - assert(LifetimeExtendedCleanupStack.empty() && - "mismatched push/pop of cleanups in EHStack!"); - assert(DeferredDeactivationCleanupStack.empty() && - "mismatched activate/deactivate of cleanups!"); bool OnlySimpleReturnStmts = NumSimpleReturnExprs > 0 && NumSimpleReturnExprs == NumReturnExprs diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index c49e9fd00c8d3..ff1873325d409 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -39,7 +39,6 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" -#include "llvm/IR/Instructions.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/SanitizerStats.h" @@ -671,51 +670,6 @@ class CodeGenFunction : public CodeGenTypeCache { EHScopeStack EHStack; llvm::SmallVector LifetimeExtendedCleanupStack; - - // A stack of cleanups which were added to EHStack but have to be deactivated - // later before being popped or emitted. These are usually deactivated on - // exiting a `CleanupDeactivationScope` scope. For instance, after a - // full-expr. - // - // These are specially useful for correctly emitting cleanups while - // encountering branches out of expression (through stmt-expr or coroutine - // suspensions). - struct DeferredDeactivateCleanup { - EHScopeStack::stable_iterator Cleanup; - llvm::Instruction *DominatingIP; - }; - llvm::SmallVector DeferredDeactivationCleanupStack; - - // Enters a new scope for capturing cleanups which are deferred to be - // deactivated, all of which will be deactivated once the scope is exited. 
- struct CleanupDeactivationScope { - CodeGenFunction &CGF; - size_t OldDeactivateCleanupStackSize; - bool Deactivated; - CleanupDeactivationScope(CodeGenFunction &CGF) - : CGF(CGF), OldDeactivateCleanupStackSize( - CGF.DeferredDeactivationCleanupStack.size()), - Deactivated(false) {} - - void ForceDeactivate() { - assert(!Deactivated && "Deactivating already deactivated scope"); - auto &Stack = CGF.DeferredDeactivationCleanupStack; - for (size_t I = Stack.size(); I > OldDeactivateCleanupStackSize; I--) { - CGF.DeactivateCleanupBlock(Stack[I - 1].Cleanup, - Stack[I - 1].DominatingIP); - Stack[I - 1].DominatingIP->eraseFromParent(); - } - Stack.resize(OldDeactivateCleanupStackSize); - Deactivated = true; - } - - ~CleanupDeactivationScope() { - if (Deactivated) - return; - ForceDeactivate(); - } - }; - llvm::SmallVector<const JumpDest *, 2> SEHTryEpilogueStack; llvm::Instruction *CurrentFuncletPad = nullptr; @@ -921,19 +875,6 @@ class CodeGenFunction : public CodeGenTypeCache { new (Buffer + sizeof(Header) + sizeof(T)) RawAddress(ActiveFlag); } - // Push a cleanup onto EHStack and deactivate it later. It is usually - // deactivated when exiting a `CleanupDeactivationScope` (for example: after a - // full expression). - template <class T, class... As> - void pushCleanupAndDeferDeactivation(CleanupKind Kind, As... A) { - // Placeholder dominating IP for this cleanup. - llvm::Instruction *DominatingIP = - Builder.CreateFlagLoad(llvm::Constant::getNullValue(Int8PtrTy)); - EHStack.pushCleanup<T>(Kind, A...); - DeferredDeactivationCleanupStack.push_back( - {EHStack.stable_begin(), DominatingIP}); - } - /// Set up the last cleanup that was pushed as a conditional /// full-expression cleanup. void initFullExprCleanup() { @@ -985,7 +926,6 @@ class CodeGenFunction : public CodeGenTypeCache { class RunCleanupsScope { EHScopeStack::stable_iterator CleanupStackDepth, OldCleanupScopeDepth; size_t LifetimeExtendedCleanupStackSize; - CleanupDeactivationScope DeactivateCleanups; bool OldDidCallStackSave; protected: bool PerformCleanup; @@ -1000,7 +940,8 @@ class CodeGenFunction : public CodeGenTypeCache { public: /// Enter a new cleanup scope.
explicit RunCleanupsScope(CodeGenFunction &CGF) - : DeactivateCleanups(CGF), PerformCleanup(true), CGF(CGF) { + : PerformCleanup(true), CGF(CGF) + { CleanupStackDepth = CGF.EHStack.stable_begin(); LifetimeExtendedCleanupStackSize = CGF.LifetimeExtendedCleanupStack.size(); @@ -1030,7 +971,6 @@ class CodeGenFunction : public CodeGenTypeCache { void ForceCleanup(std::initializer_list<llvm::Value **> ValuesToReload = {}) { assert(PerformCleanup && "Already forced cleanup"); CGF.DidCallStackSave = OldDidCallStackSave; - DeactivateCleanups.ForceDeactivate(); CGF.PopCleanupBlocks(CleanupStackDepth, LifetimeExtendedCleanupStackSize, ValuesToReload); PerformCleanup = false; @@ -2220,11 +2160,6 @@ class CodeGenFunction : public CodeGenTypeCache { Address addr, QualType type); void pushDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); - void pushDestroyAndDeferDeactivation(QualType::DestructionKind dtorKind, - Address addr, QualType type); - void pushDestroyAndDeferDeactivation(CleanupKind cleanupKind, Address addr, - QualType type, Destroyer *destroyer, - bool useEHCleanupForArray); void pushLifetimeExtendedDestroy(CleanupKind kind, Address addr, QualType type, Destroyer *destroyer, bool useEHCleanupForArray); @@ -2763,33 +2698,6 @@ class CodeGenFunction : public CodeGenTypeCache { TBAAAccessInfo *TBAAInfo = nullptr); LValue EmitLoadOfPointerLValue(Address Ptr, const PointerType *PtrTy); -private: - struct AllocaTracker { - void Add(llvm::AllocaInst *I) { Allocas.push_back(I); } - llvm::SmallVector<llvm::AllocaInst *> Take() { return std::move(Allocas); } - - private: - llvm::SmallVector<llvm::AllocaInst *> Allocas; - }; - AllocaTracker *Allocas = nullptr; - -public: - // Captures all the allocas created during the scope of its RAII object. - struct AllocaTrackerRAII { - AllocaTrackerRAII(CodeGenFunction &CGF) - : CGF(CGF), OldTracker(CGF.Allocas) { - CGF.Allocas = &Tracker; - } - ~AllocaTrackerRAII() { CGF.Allocas = OldTracker; } - - llvm::SmallVector<llvm::AllocaInst *> Take() { return Tracker.Take(); } - - private: - CodeGenFunction &CGF; - AllocaTracker *OldTracker; - AllocaTracker Tracker; - }; - /// CreateTempAlloca - This creates an alloca and inserts it into the entry /// block if \p ArraySize is nullptr, otherwise inserts it at the current /// insertion point of the builder. The caller is responsible for setting an diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index e44749672d582..0c447b20cef40 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3952,9 +3952,20 @@ bool CodeGenModule::shouldEmitFunction(GlobalDecl GD) { // behavior may break ABI compatibility of the current unit. if (const Module *M = F->getOwningModule(); M && M->getTopLevelModule()->isNamedModule() && - getContext().getCurrentNamedModule() != M->getTopLevelModule() && - !F->hasAttr<AlwaysInlineAttr>()) - return false; + getContext().getCurrentNamedModule() != M->getTopLevelModule()) { + // It is common practice to mark a template member function as + // always-inline and to declare the template as an extern explicit + // instantiation without ever providing a definition for that member + // function, so we have to emit always-inline functions that come from + // explicit instantiations. + // + // See https://github.com/llvm/llvm-project/issues/86893 for details. + // + // TODO: Maybe it is better to emit a warning when we call a non-inline + // function from another module unit if it is marked as always-inline.
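A sketch of the pattern the comment above describes, loosely following llvm-project issue #86893 (hypothetical module code; names are illustrative):

    // m.cppm
    export module m;
    export template <class T> struct W {
      __attribute__((always_inline)) void run() {}
    };
    extern template struct W<int>; // no out-of-line definition of run()

    // user.cpp
    import m;
    void use() { W<int>().run(); } // run() must still be emitted here even
                                   // though it belongs to module m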
+ if (!F->isTemplateInstantiation() || !F->hasAttr()) { + return false; + } + } if (F->hasAttr()) return false; diff --git a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp index b1dd7c4372d47..96b3cc3bb8ffb 100644 --- a/clang/lib/Driver/ToolChains/Arch/RISCV.cpp +++ b/clang/lib/Driver/ToolChains/Arch/RISCV.cpp @@ -68,8 +68,10 @@ static void getRISCFeaturesFromMcpu(const Driver &D, const Arg *A, << A->getSpelling() << Mcpu; } - if (llvm::RISCV::hasFastUnalignedAccess(Mcpu)) - Features.push_back("+fast-unaligned-access"); + if (llvm::RISCV::hasFastUnalignedAccess(Mcpu)) { + Features.push_back("+unaligned-scalar-mem"); + Features.push_back("+unaligned-vector-mem"); + } } void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, @@ -168,12 +170,16 @@ void riscv::getRISCVTargetFeatures(const Driver &D, const llvm::Triple &Triple, } // Android requires fast unaligned access on RISCV64. - if (Triple.isAndroid()) - Features.push_back("+fast-unaligned-access"); + if (Triple.isAndroid()) { + Features.push_back("+unaligned-scalar-mem"); + Features.push_back("+unaligned-vector-mem"); + } // -mstrict-align is default, unless -mno-strict-align is specified. AddTargetFeature(Args, Features, options::OPT_mno_strict_align, - options::OPT_mstrict_align, "fast-unaligned-access"); + options::OPT_mstrict_align, "unaligned-scalar-mem"); + AddTargetFeature(Args, Features, options::OPT_mno_strict_align, + options::OPT_mstrict_align, "unaligned-vector-mem"); // Now add any that the user explicitly requested on the command line, // which may override the defaults. diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6d52eced10429..096ed14f95704 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -346,11 +346,14 @@ static bool addExceptionArgs(const ArgList &Args, types::ID InputType, bool EH = Args.hasFlag(options::OPT_fexceptions, options::OPT_fno_exceptions, false); - bool EHa = Args.hasFlag(options::OPT_fasync_exceptions, - options::OPT_fno_async_exceptions, false); - if (EHa) { - CmdArgs.push_back("-fasync-exceptions"); - EH = true; + // Async exceptions are Windows MSVC only. + if (Triple.isWindowsMSVCEnvironment()) { + bool EHa = Args.hasFlag(options::OPT_fasync_exceptions, + options::OPT_fno_async_exceptions, false); + if (EHa) { + CmdArgs.push_back("-fasync-exceptions"); + EH = true; + } } // Obj-C exceptions are enabled by default, regardless of -fexceptions. This @@ -8102,7 +8105,8 @@ struct EHFlags { /// The 'a' modifier is unimplemented and fundamentally hard in LLVM IR. /// - c: Assume that extern "C" functions are implicitly nounwind. /// The default is /EHs-c-, meaning cleanups are disabled. -static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) { +static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args, + bool isWindowsMSVC) { EHFlags EH; std::vector EHArgs = @@ -8112,8 +8116,15 @@ static EHFlags parseClangCLEHFlags(const Driver &D, const ArgList &Args) { switch (EHVal[I]) { case 'a': EH.Asynch = maybeConsumeDash(EHVal, I); - if (EH.Asynch) + if (EH.Asynch) { + // Async exceptions are Windows MSVC only. 
+ if (!isWindowsMSVC) { + EH.Asynch = false; + D.Diag(clang::diag::warn_drv_unused_argument) << "/EHa" << EHVal; + continue; + } EH.Synch = false; + } continue; case 'c': EH.NoUnwindC = maybeConsumeDash(EHVal, I); @@ -8177,7 +8188,8 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType, const Driver &D = getToolChain().getDriver(); - EHFlags EH = parseClangCLEHFlags(D, Args); + bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment(); + EHFlags EH = parseClangCLEHFlags(D, Args, IsWindowsMSVC); if (!isNVPTX && (EH.Synch || EH.Asynch)) { if (types::isCXX(InputType)) CmdArgs.push_back("-fcxx-exceptions"); diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp index 5acc86191f8f9..31c4a3345c09d 100644 --- a/clang/lib/Index/USRGeneration.cpp +++ b/clang/lib/Index/USRGeneration.cpp @@ -267,10 +267,13 @@ void USRGenerator::VisitFunctionDecl(const FunctionDecl *D) { Out << '>'; } + QualType CanonicalType = D->getType().getCanonicalType(); // Mangle in type information for the arguments. - for (auto *PD : D->parameters()) { - Out << '#'; - VisitType(PD->getType()); + if (const auto *FPT = CanonicalType->getAs()) { + for (QualType PT : FPT->param_types()) { + Out << '#'; + VisitType(PT); + } } if (D->isVariadic()) Out << '.'; diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index cf31456b6950a..b20e6efcebfd1 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -550,7 +550,8 @@ std::unique_ptr Interpreter::FindRuntimeInterface() { auto LookupInterface = [&](Expr *&Interface, llvm::StringRef Name) { LookupResult R(S, &Ctx.Idents.get(Name), SourceLocation(), - Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration); + Sema::LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); S.LookupQualifiedName(R, Ctx.getTranslationUnitDecl()); if (R.empty()) return false; diff --git a/clang/lib/Interpreter/InterpreterUtils.cpp b/clang/lib/Interpreter/InterpreterUtils.cpp index c19cf6aa3156c..45f6322b8461e 100644 --- a/clang/lib/Interpreter/InterpreterUtils.cpp +++ b/clang/lib/Interpreter/InterpreterUtils.cpp @@ -72,7 +72,7 @@ NamedDecl *LookupNamed(Sema &S, llvm::StringRef Name, const DeclContext *Within) { DeclarationName DName = &S.Context.Idents.get(Name); LookupResult R(S, DName, SourceLocation(), Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); R.suppressDiagnostics(); diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 2b934234b7cf5..5f26b5a9e46be 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -28,6 +28,7 @@ #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaDiagnostic.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringSwitch.h" @@ -2383,7 +2384,7 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, } if (getLangOpts().OpenMP) - Actions.startOpenMPCXXRangeFor(); + Actions.OpenMP().startOpenMPCXXRangeFor(); if (Tok.is(tok::l_brace)) FRI->RangeExpr = ParseBraceInitializer(); else @@ -5330,7 +5331,7 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, stripTypeAttributesOffDeclSpec(attrs, DS, TUK); - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; if (!Name && TUK == Sema::TUK_Definition && Tok.is(tok::l_brace) && NextToken().is(tok::identifier)) SkipBody = 
Actions.shouldSkipAnonEnumBody(getCurScope(), @@ -7659,8 +7660,21 @@ void Parser::ParseParameterDeclarationClause( // Parse a C++23 Explicit Object Parameter // We do that in all language modes to produce a better diagnostic. SourceLocation ThisLoc; - if (getLangOpts().CPlusPlus && Tok.is(tok::kw_this)) + if (getLangOpts().CPlusPlus && Tok.is(tok::kw_this)) { ThisLoc = ConsumeToken(); + // C++23 [dcl.fct]p6: + // An explicit-object-parameter-declaration is a parameter-declaration + // with a this specifier. An explicit-object-parameter-declaration + // shall appear only as the first parameter-declaration of a + // parameter-declaration-list of either: + // - a member-declarator that declares a member function, or + // - a lambda-declarator. + // + // The parameter-declaration-list of a requires-expression is not such + // a context. + if (DeclaratorCtx == DeclaratorContext::RequiresExpr) + Diag(ThisLoc, diag::err_requires_expr_explicit_object_parameter); + } ParseDeclarationSpecifiers(DS, /*TemplateInfo=*/ParsedTemplateInfo(), AS_none, DeclSpecContext::DSC_normal, diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index cd4803d51bc1d..51fd64b2d01aa 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2092,7 +2092,7 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, TypeResult TypeResult = true; // invalid bool Owned = false; - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; if (TemplateId) { // Explicit specialization, class template partial specialization, // or explicit instantiation. diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index 473ec9afd6018..32d96f81c4c8d 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -31,6 +31,7 @@ #include "clang/Sema/ParsedTemplate.h" #include "clang/Sema/Scope.h" #include "clang/Sema/SemaCUDA.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/SemaSYCL.h" #include "clang/Sema/TypoCorrection.h" #include "llvm/ADT/SmallVector.h" @@ -2075,7 +2076,7 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { // replace this call to ActOnOpenACCArraySectionExpr in the future. // Eventually we'll genericize the OPenMPArraySectionExpr type as // well. - LHS = Actions.ActOnOMPArraySectionExpr( + LHS = Actions.OpenMP().ActOnOMPArraySectionExpr( LHS.get(), Loc, ArgExprs.empty() ? nullptr : ArgExprs[0], ColonLocFirst, ColonLocSecond, Length.get(), Stride.get(), RLoc); } else { @@ -3277,7 +3278,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, if (ErrorFound) { Result = ExprError(); } else if (!Result.isInvalid()) { - Result = Actions.ActOnOMPArrayShapingExpr( + Result = Actions.OpenMP().ActOnOMPArrayShapingExpr( Result.get(), OpenLoc, RParenLoc, OMPDimensions, OMPBracketsRanges); } return Result; diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 43d6105dcf31c..0d2ad980696fc 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -3910,10 +3910,10 @@ ExprResult Parser::ParseTypeTrait() { SmallVector Args; do { // Parse the next type. - TypeResult Ty = - ParseTypeName(/*SourceRange=*/nullptr, - getLangOpts().CPlusPlus ? DeclaratorContext::TemplateArg - : DeclaratorContext::TypeName); + TypeResult Ty = ParseTypeName(/*SourceRange=*/nullptr, + getLangOpts().CPlusPlus + ? 
DeclaratorContext::TemplateTypeArg + : DeclaratorContext::TypeName); if (Ty.isInvalid()) { Parens.skipToEnd(); return ExprError(); @@ -3955,8 +3955,8 @@ ExprResult Parser::ParseArrayTypeTrait() { if (T.expectAndConsume()) return ExprError(); - TypeResult Ty = - ParseTypeName(/*SourceRange=*/nullptr, DeclaratorContext::TemplateArg); + TypeResult Ty = ParseTypeName(/*SourceRange=*/nullptr, + DeclaratorContext::TemplateTypeArg); if (Ty.isInvalid()) { SkipUntil(tok::comma, StopAtSemi); SkipUntil(tok::r_paren, StopAtSemi); diff --git a/clang/lib/Parse/ParseObjc.cpp b/clang/lib/Parse/ParseObjc.cpp index 887d7a36cee7e..671dcb71e51a3 100644 --- a/clang/lib/Parse/ParseObjc.cpp +++ b/clang/lib/Parse/ParseObjc.cpp @@ -375,7 +375,7 @@ Decl *Parser::ParseObjCAtInterfaceDeclaration(SourceLocation AtLoc, Actions.ActOnTypedefedProtocols(protocols, protocolLocs, superClassId, superClassLoc); - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; ObjCInterfaceDecl *ClsType = Actions.ActOnStartClassInterface( getCurScope(), AtLoc, nameId, nameLoc, typeParameterList, superClassId, superClassLoc, typeArgs, @@ -2133,7 +2133,7 @@ Parser::ParseObjCAtProtocolDeclaration(SourceLocation AtLoc, /*consumeLastToken=*/true)) return nullptr; - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; ObjCProtocolDecl *ProtoType = Actions.ActOnStartProtocolInterface( AtLoc, protocolName, nameLoc, ProtocolRefs.data(), ProtocolRefs.size(), ProtocolLocs.data(), EndProtoLoc, attrs, &SkipBody); diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 91f2b8afcf0c2..123be476e928e 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -835,19 +835,23 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( case OpenACCClauseKind::Default: { Token DefKindTok = getCurToken(); - if (expectIdentifierOrKeyword(*this)) - break; + if (expectIdentifierOrKeyword(*this)) { + Parens.skipToEnd(); + return OpenACCCanContinue(); + } ConsumeToken(); OpenACCDefaultClauseKind DefKind = getOpenACCDefaultClauseKind(DefKindTok); - if (DefKind == OpenACCDefaultClauseKind::Invalid) + if (DefKind == OpenACCDefaultClauseKind::Invalid) { Diag(DefKindTok, diag::err_acc_invalid_default_clause_kind); - else - ParsedClause.setDefaultDetails(DefKind); + Parens.skipToEnd(); + return OpenACCCanContinue(); + } + ParsedClause.setDefaultDetails(DefKind); break; } case OpenACCClauseKind::If: { @@ -977,6 +981,8 @@ Parser::OpenACCClauseParseResult Parser::ParseOpenACCClauseParams( case OpenACCClauseKind::Self: { assert(DirKind != OpenACCDirectiveKind::Update); ExprResult CondExpr = ParseOpenACCConditionExpr(); + ParsedClause.setConditionDetails(CondExpr.isUsable() ? 
CondExpr.get() + : nullptr); if (CondExpr.isInvalid()) { Parens.skipToEnd(); diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 814126e321d3b..480201bc06f61 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -21,6 +21,7 @@ #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/EnterExpressionEvaluationContext.h" #include "clang/Sema/Scope.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/UniqueVector.h" @@ -87,7 +88,7 @@ class DeclDirectiveListParserHelper final { DeclDirectiveListParserHelper(Parser *P, OpenMPDirectiveKind Kind) : P(P), Kind(Kind) {} void operator()(CXXScopeSpec &SS, DeclarationNameInfo NameInfo) { - ExprResult Res = P->getActions().ActOnOpenMPIdExpression( + ExprResult Res = P->getActions().OpenMP().ActOnOpenMPIdExpression( P->getCurScope(), SS, NameInfo, Kind); if (Res.isUsable()) Identifiers.push_back(Res.get()); @@ -322,8 +323,8 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { SourceRange Range; TypeResult TR = ParseTypeName(&Range, DeclaratorContext::Prototype, AS); if (TR.isUsable()) { - QualType ReductionType = - Actions.ActOnOpenMPDeclareReductionType(Range.getBegin(), TR); + QualType ReductionType = Actions.OpenMP().ActOnOpenMPDeclareReductionType( + Range.getBegin(), TR); if (!ReductionType.isNull()) { ReductionTypes.push_back( std::make_pair(ReductionType, Range.getBegin())); @@ -363,8 +364,10 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { return DeclGroupPtrTy(); } - DeclGroupPtrTy DRD = Actions.ActOnOpenMPDeclareReductionDirectiveStart( - getCurScope(), Actions.getCurLexicalContext(), Name, ReductionTypes, AS); + DeclGroupPtrTy DRD = + Actions.OpenMP().ActOnOpenMPDeclareReductionDirectiveStart( + getCurScope(), Actions.getCurLexicalContext(), Name, ReductionTypes, + AS); // Parse expression and then parse initializer if any for each // correct type. @@ -375,10 +378,11 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { Scope::CompoundStmtScope | Scope::OpenMPDirectiveScope); // Parse expression. - Actions.ActOnOpenMPDeclareReductionCombinerStart(getCurScope(), D); + Actions.OpenMP().ActOnOpenMPDeclareReductionCombinerStart(getCurScope(), D); ExprResult CombinerResult = Actions.ActOnFinishFullExpr( ParseExpression().get(), D->getLocation(), /*DiscardedValue*/ false); - Actions.ActOnOpenMPDeclareReductionCombinerEnd(D, CombinerResult.get()); + Actions.OpenMP().ActOnOpenMPDeclareReductionCombinerEnd( + D, CombinerResult.get()); if (CombinerResult.isInvalid() && Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) { @@ -411,8 +415,8 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { Scope::OpenMPDirectiveScope); // Parse expression. VarDecl *OmpPrivParm = - Actions.ActOnOpenMPDeclareReductionInitializerStart(getCurScope(), - D); + Actions.OpenMP().ActOnOpenMPDeclareReductionInitializerStart( + getCurScope(), D); // Check if initializer is omp_priv or something else. 
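For reference, a declare reduction directive exercising the omp_priv initializer path checked here (hypothetical usage):

    #pragma omp declare reduction(merge : int : omp_out += omp_in) \
        initializer(omp_priv = 0)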
if (Tok.is(tok::identifier) && Tok.getIdentifierInfo()->isStr("omp_priv")) { @@ -423,7 +427,7 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { ParseAssignmentExpression().get(), D->getLocation(), /*DiscardedValue*/ false); } - Actions.ActOnOpenMPDeclareReductionInitializerEnd( + Actions.OpenMP().ActOnOpenMPDeclareReductionInitializerEnd( D, InitializerResult.get(), OmpPrivParm); if (InitializerResult.isInvalid() && Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) { @@ -444,8 +448,8 @@ Parser::ParseOpenMPDeclareReductionDirective(AccessSpecifier AS) { else TPA.Commit(); } - return Actions.ActOnOpenMPDeclareReductionDirectiveEnd(getCurScope(), DRD, - IsCorrect); + return Actions.OpenMP().ActOnOpenMPDeclareReductionDirectiveEnd( + getCurScope(), DRD, IsCorrect); } void Parser::ParseOpenMPReductionInitializerForDecl(VarDecl *OmpPrivParm) { @@ -569,8 +573,8 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { SourceRange Range; TypeResult ParsedType = parseOpenMPDeclareMapperVarDecl(Range, VName, AS); if (ParsedType.isUsable()) - MapperType = - Actions.ActOnOpenMPDeclareMapperType(Range.getBegin(), ParsedType); + MapperType = Actions.OpenMP().ActOnOpenMPDeclareMapperType(Range.getBegin(), + ParsedType); if (MapperType.isNull()) IsCorrect = false; if (!IsCorrect) { @@ -591,11 +595,13 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { unsigned ScopeFlags = Scope::FnScope | Scope::DeclScope | Scope::CompoundStmtScope | Scope::OpenMPDirectiveScope; ParseScope OMPDirectiveScope(this, ScopeFlags); - Actions.StartOpenMPDSABlock(OMPD_declare_mapper, DirName, getCurScope(), Loc); + Actions.OpenMP().StartOpenMPDSABlock(OMPD_declare_mapper, DirName, + getCurScope(), Loc); // Add the mapper variable declaration. - ExprResult MapperVarRef = Actions.ActOnOpenMPDeclareMapperDirectiveVarDecl( - getCurScope(), MapperType, Range.getBegin(), VName); + ExprResult MapperVarRef = + Actions.OpenMP().ActOnOpenMPDeclareMapperDirectiveVarDecl( + getCurScope(), MapperType, Range.getBegin(), VName); // Parse map clauses. SmallVector Clauses; @@ -603,7 +609,7 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause(OMPD_declare_mapper, CKind, Clauses.empty()); if (Clause) @@ -613,7 +619,7 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } if (Clauses.empty()) { Diag(Tok, diag::err_omp_expected_clause) @@ -622,9 +628,9 @@ Parser::ParseOpenMPDeclareMapperDirective(AccessSpecifier AS) { } // Exit scope. 
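And a declare mapper directive of the shape this function parses (hypothetical usage):

    struct Vec { int len; double *data; };
    #pragma omp declare mapper(vmap : Vec v) map(v.len, v.data[0:v.len])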
- Actions.EndOpenMPDSABlock(nullptr); + Actions.OpenMP().EndOpenMPDSABlock(nullptr); OMPDirectiveScope.Exit(); - DeclGroupPtrTy DG = Actions.ActOnOpenMPDeclareMapperDirective( + DeclGroupPtrTy DG = Actions.OpenMP().ActOnOpenMPDeclareMapperDirective( getCurScope(), Actions.getCurLexicalContext(), MapperId, MapperType, Range.getBegin(), VName, AS, MapperVarRef.get(), Clauses); if (!IsCorrect) @@ -652,7 +658,8 @@ TypeResult Parser::parseOpenMPDeclareMapperVarDecl(SourceRange &Range, } Name = Actions.GetNameForDeclarator(DeclaratorInfo).getName(); - return Actions.ActOnOpenMPDeclareMapperVarDecl(getCurScope(), DeclaratorInfo); + return Actions.OpenMP().ActOnOpenMPDeclareMapperVarDecl(getCurScope(), + DeclaratorInfo); } namespace { @@ -748,7 +755,7 @@ static bool parseDeclareSimdClauses( OpenMPClauseKind CKind = getOpenMPClauseKind(ClauseName); if (CKind == OMPC_uniform || CKind == OMPC_aligned || CKind == OMPC_linear) { - Sema::OpenMPVarListDataTy Data; + SemaOpenMP::OpenMPVarListDataTy Data; SmallVectorImpl *Vars = &Uniforms; if (CKind == OMPC_aligned) { Vars = &Aligneds; @@ -768,7 +775,7 @@ static bool parseDeclareSimdClauses( assert(0 <= Data.ExtraModifier && Data.ExtraModifier <= OMPC_LINEAR_unknown && "Unexpected linear modifier."); - if (P.getActions().CheckOpenMPLinearModifier( + if (P.getActions().OpenMP().CheckOpenMPLinearModifier( static_cast(Data.ExtraModifier), Data.ExtraModifierLoc)) Data.ExtraModifier = OMPC_LINEAR_val; @@ -816,7 +823,7 @@ Parser::ParseOMPDeclareSimdClauses(Parser::DeclGroupPtrTy Ptr, SourceLocation EndLoc = ConsumeAnnotationToken(); if (IsError) return Ptr; - return Actions.ActOnOpenMPDeclareSimdDirective( + return Actions.OpenMP().ActOnOpenMPDeclareSimdDirective( Ptr, BS, Simdlen.get(), Uniforms, Aligneds, Alignments, Linears, LinModifiers, Steps, SourceRange(Loc, EndLoc)); } @@ -1412,7 +1419,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, return; } - OMPTraitInfo *ParentTI = Actions.getOMPTraitInfoForSurroundingScope(); + OMPTraitInfo *ParentTI = + Actions.OpenMP().getOMPTraitInfoForSurroundingScope(); ASTContext &ASTCtx = Actions.getASTContext(); OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo(); SmallVector AdjustNothing; @@ -1445,7 +1453,7 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, case OMPC_adjust_args: { AdjustArgsLoc = Tok.getLocation(); ConsumeToken(); - Sema::OpenMPVarListDataTy Data; + SemaOpenMP::OpenMPVarListDataTy Data; SmallVector Vars; IsError = ParseOpenMPVarList(OMPD_declare_variant, OMPC_adjust_args, Vars, Data); @@ -1486,12 +1494,12 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, } std::optional> DeclVarData = - Actions.checkOpenMPDeclareVariantFunction( + Actions.OpenMP().checkOpenMPDeclareVariantFunction( Ptr, AssociatedFunction.get(), TI, AppendArgs.size(), SourceRange(Loc, Tok.getLocation())); if (DeclVarData && !TI.Sets.empty()) - Actions.ActOnOpenMPDeclareVariantDirective( + Actions.OpenMP().ActOnOpenMPDeclareVariantDirective( DeclVarData->first, DeclVarData->second, TI, AdjustNothing, AdjustNeedDevicePtr, AppendArgs, AdjustArgsLoc, AppendArgsLoc, SourceRange(Loc, Tok.getLocation())); @@ -1642,7 +1650,7 @@ void Parser::ParseOpenMPClauses(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind = Tok.isAnnotation() ? 
OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( DKind, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -1651,13 +1659,13 @@ void Parser::ParseOpenMPClauses(OpenMPDirectiveKind DKind, if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } } @@ -1750,12 +1758,13 @@ void Parser::ParseOpenMPAssumesDirective(OpenMPDirectiveKind DKind, Assumptions.push_back(Assumption); } - Actions.ActOnOpenMPAssumesDirective(Loc, DKind, Assumptions, SkippedClauses); + Actions.OpenMP().ActOnOpenMPAssumesDirective(Loc, DKind, Assumptions, + SkippedClauses); } void Parser::ParseOpenMPEndAssumesDirective(SourceLocation Loc) { - if (Actions.isInOpenMPAssumeScope()) - Actions.ActOnOpenMPEndAssumesDirective(); + if (Actions.OpenMP().isInOpenMPAssumeScope()) + Actions.OpenMP().ActOnOpenMPEndAssumesDirective(); else Diag(Loc, diag::err_expected_begin_assumes); } @@ -1811,7 +1820,7 @@ parseOpenMPSimpleClause(Parser &P, OpenMPClauseKind Kind) { } void Parser::ParseOMPDeclareTargetClauses( - Sema::DeclareTargetContextInfo &DTCI) { + SemaOpenMP::DeclareTargetContextInfo &DTCI) { SourceLocation DeviceTypeLoc; bool RequiresToOrLinkOrIndirectClause = false; bool HasToOrLinkOrIndirectClause = false; @@ -1910,11 +1919,11 @@ void Parser::ParseOMPDeclareTargetClauses( if (DTCI.Kind == OMPD_declare_target || HasIdentifier) { auto &&Callback = [this, MT, &DTCI](CXXScopeSpec &SS, DeclarationNameInfo NameInfo) { - NamedDecl *ND = - Actions.lookupOpenMPDeclareTargetName(getCurScope(), SS, NameInfo); + NamedDecl *ND = Actions.OpenMP().lookupOpenMPDeclareTargetName( + getCurScope(), SS, NameInfo); if (!ND) return; - Sema::DeclareTargetContextInfo::MapInfo MI{MT, NameInfo.getLoc()}; + SemaOpenMP::DeclareTargetContextInfo::MapInfo MI{MT, NameInfo.getLoc()}; bool FirstMapping = DTCI.ExplicitlyMapped.try_emplace(ND, MI).second; if (!FirstMapping) Diag(NameInfo.getLoc(), diag::err_omp_declare_target_multiple) @@ -2090,8 +2099,8 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( skipUntilPragmaOpenMPEnd(DKind); // Skip the last annot_pragma_openmp_end. ConsumeAnnotationToken(); - return Actions.ActOnOpenMPThreadprivateDirective(Loc, - Helper.getIdentifiers()); + return Actions.OpenMP().ActOnOpenMPThreadprivateDirective( + Loc, Helper.getIdentifiers()); } break; } @@ -2109,7 +2118,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( OMPD_allocate, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -2118,20 +2127,20 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. 
if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } skipUntilPragmaOpenMPEnd(DKind); } // Skip the last annot_pragma_openmp_end. ConsumeAnnotationToken(); - return Actions.ActOnOpenMPAllocateDirective(Loc, Helper.getIdentifiers(), - Clauses); + return Actions.OpenMP().ActOnOpenMPAllocateDirective( + Loc, Helper.getIdentifiers(), Clauses); } break; } @@ -2150,7 +2159,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( OMPD_requires, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -2159,13 +2168,13 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } // Consume final annot_pragma_openmp_end if (Clauses.empty()) { @@ -2175,14 +2184,15 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( return nullptr; } ConsumeAnnotationToken(); - return Actions.ActOnOpenMPRequiresDirective(StartLoc, Clauses); + return Actions.OpenMP().ActOnOpenMPRequiresDirective(StartLoc, Clauses); } case OMPD_error: { SmallVector Clauses; SourceLocation StartLoc = ConsumeToken(); ParseOpenMPClauses(DKind, Clauses, StartLoc); - Actions.ActOnOpenMPErrorDirective(Clauses, StartLoc, SourceLocation(), - /*InExContext = */ false); + Actions.OpenMP().ActOnOpenMPErrorDirective(Clauses, StartLoc, + SourceLocation(), + /*InExContext = */ false); break; } case OMPD_assumes: @@ -2217,7 +2227,8 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( // { #pragma omp end declare variant } // ConsumeToken(); - OMPTraitInfo *ParentTI = Actions.getOMPTraitInfoForSurroundingScope(); + OMPTraitInfo *ParentTI = + Actions.OpenMP().getOMPTraitInfoForSurroundingScope(); ASTContext &ASTCtx = Actions.getASTContext(); OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo(); if (parseOMPDeclareVariantMatchClause(Loc, TI, ParentTI)) { @@ -2248,7 +2259,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( /* ConstructTraits */ ArrayRef()); if (isVariantApplicableInContext(VMI, OMPCtx, /* DeviceSetOnly */ true)) { - Actions.ActOnOpenMPBeginDeclareVariant(Loc, TI); + Actions.OpenMP().ActOnOpenMPBeginDeclareVariant(Loc, TI); break; } @@ -2275,8 +2286,8 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( break; } case OMPD_end_declare_variant: { - if (Actions.isInOpenMPDeclareVariantScope()) - Actions.ActOnOpenMPEndDeclareVariant(); + if (Actions.OpenMP().isInOpenMPDeclareVariantScope()) + Actions.OpenMP().ActOnOpenMPEndDeclareVariant(); else Diag(Loc, diag::err_expected_begin_declare_variant); ConsumeToken(); @@ -2331,7 +2342,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_declare_target: { SourceLocation DTLoc = ConsumeAnyToken(); bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end); - Sema::DeclareTargetContextInfo DTCI(DKind, DTLoc); + SemaOpenMP::DeclareTargetContextInfo DTCI(DKind, DTLoc); if (HasClauses) 
ParseOMPDeclareTargetClauses(DTCI); bool HasImplicitMappings = DKind == OMPD_begin_declare_target || @@ -2342,24 +2353,24 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( ConsumeAnyToken(); if (HasImplicitMappings) { - Actions.ActOnStartOpenMPDeclareTargetContext(DTCI); + Actions.OpenMP().ActOnStartOpenMPDeclareTargetContext(DTCI); return nullptr; } - Actions.ActOnFinishedOpenMPDeclareTargetContext(DTCI); + Actions.OpenMP().ActOnFinishedOpenMPDeclareTargetContext(DTCI); llvm::SmallVector Decls; for (auto &It : DTCI.ExplicitlyMapped) Decls.push_back(It.first); return Actions.BuildDeclaratorGroup(Decls); } case OMPD_end_declare_target: { - if (!Actions.isInOpenMPDeclareTargetContext()) { + if (!Actions.OpenMP().isInOpenMPDeclareTargetContext()) { Diag(Tok, diag::err_omp_unexpected_directive) << 1 << getOpenMPDirectiveName(DKind); break; } - const Sema::DeclareTargetContextInfo &DTCI = - Actions.ActOnOpenMPEndDeclareTargetDirective(); + const SemaOpenMP::DeclareTargetContextInfo &DTCI = + Actions.OpenMP().ActOnOpenMPEndDeclareTargetDirective(); ParseOMPEndDeclareTargetDirective(DTCI.Kind, DKind, DTCI.Loc); return nullptr; } @@ -2683,7 +2694,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (!ParseOpenMPSimpleVarList(DKind, Helper, /*AllowScopeSpecifier=*/false)) { skipUntilPragmaOpenMPEnd(DKind); - DeclGroupPtrTy Res = Actions.ActOnOpenMPThreadprivateDirective( + DeclGroupPtrTy Res = Actions.OpenMP().ActOnOpenMPThreadprivateDirective( Loc, Helper.getIdentifiers()); Directive = Actions.ActOnDeclStmt(Res, Loc, Tok.getLocation()); } @@ -2710,7 +2721,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( OpenMPClauseKind CKind = Tok.isAnnotation() ? OMPC_unknown : getOpenMPClauseKind(PP.getSpelling(Tok)); - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); OMPClause *Clause = ParseOpenMPClause( OMPD_allocate, CKind, !FirstClauses[unsigned(CKind)].getInt()); SkipUntil(tok::comma, tok::identifier, tok::annot_pragma_openmp_end, @@ -2719,17 +2730,17 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (Clause != nullptr) Clauses.push_back(Clause); if (Tok.is(tok::annot_pragma_openmp_end)) { - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); break; } // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } skipUntilPragmaOpenMPEnd(DKind); } - DeclGroupPtrTy Res = Actions.ActOnOpenMPAllocateDirective( + DeclGroupPtrTy Res = Actions.OpenMP().ActOnOpenMPAllocateDirective( Loc, Helper.getIdentifiers(), Clauses); Directive = Actions.ActOnDeclStmt(Res, Loc, Tok.getLocation()); } @@ -2875,7 +2886,8 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (isOpenMPSimdDirective(DKind)) ScopeFlags |= Scope::OpenMPSimdDirectiveScope; ParseScope OMPDirectiveScope(this, ScopeFlags); - Actions.StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), Loc); + Actions.OpenMP().StartOpenMPDSABlock(DKind, DirName, Actions.getCurScope(), + Loc); while (Tok.isNot(tok::annot_pragma_openmp_end)) { // If we are parsing for a directive within a metadirective, the directive @@ -2909,7 +2921,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( } // No more implicit clauses allowed. 
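The identifier-list directives handled on this path take forms like the following (hypothetical usage):

    int counter;
    #pragma omp threadprivate(counter)
    #pragma omp allocate(counter) allocator(omp_low_lat_mem_alloc)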
ImplicitClauseAllowed = false; - Actions.StartOpenMPClause(CKind); + Actions.OpenMP().StartOpenMPClause(CKind); HasImplicitClause = false; OMPClause *Clause = ParseOpenMPClause( DKind, CKind, !FirstClauses[unsigned(CKind)].getInt()); @@ -2922,7 +2934,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( // Skip ',' if any. if (Tok.is(tok::comma)) ConsumeToken(); - Actions.EndOpenMPClause(); + Actions.OpenMP().EndOpenMPClause(); } // End location of the directive. EndLoc = Tok.getLocation(); @@ -2953,7 +2965,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( StmtResult AssociatedStmt; if (HasAssociatedStatement) { // The body is a block scope like in Lambdas and Blocks. - Actions.ActOnOpenMPRegionStart(DKind, getCurScope()); + Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); // FIXME: We create a bogus CompoundStmt scope to hold the contents of // the captured region. Code elsewhere assumes that any FunctionScopeInfo // should have at least one compound statement scope within it. @@ -2964,30 +2976,33 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( if (AssociatedStmt.isUsable() && isOpenMPLoopDirective(DKind) && getLangOpts().OpenMPIRBuilder) - AssociatedStmt = Actions.ActOnOpenMPLoopnest(AssociatedStmt.get()); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPLoopnest(AssociatedStmt.get()); } - AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); } else if (DKind == OMPD_target_update || DKind == OMPD_target_enter_data || DKind == OMPD_target_exit_data) { - Actions.ActOnOpenMPRegionStart(DKind, getCurScope()); + Actions.OpenMP().ActOnOpenMPRegionStart(DKind, getCurScope()); AssociatedStmt = (Sema::CompoundScopeRAII(Actions), Actions.ActOnCompoundStmt(Loc, Loc, std::nullopt, /*isStmtExpr=*/false)); - AssociatedStmt = Actions.ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); + AssociatedStmt = + Actions.OpenMP().ActOnOpenMPRegionEnd(AssociatedStmt, Clauses); } - Directive = Actions.ActOnOpenMPExecutableDirective( + Directive = Actions.OpenMP().ActOnOpenMPExecutableDirective( DKind, DirName, CancelRegion, Clauses, AssociatedStmt.get(), Loc, EndLoc); // Exit scope. - Actions.EndOpenMPDSABlock(Directive.get()); + Actions.OpenMP().EndOpenMPDSABlock(Directive.get()); OMPDirectiveScope.Exit(); break; } case OMPD_declare_target: { SourceLocation DTLoc = ConsumeAnyToken(); bool HasClauses = Tok.isNot(tok::annot_pragma_openmp_end); - Sema::DeclareTargetContextInfo DTCI(DKind, DTLoc); + SemaOpenMP::DeclareTargetContextInfo DTCI(DKind, DTLoc); if (HasClauses) ParseOMPDeclareTargetClauses(DTCI); bool HasImplicitMappings = @@ -3003,7 +3018,7 @@ StmtResult Parser::ParseOpenMPDeclarativeOrExecutableDirective( // Skip the last annot_pragma_openmp_end. 
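For reference, the clause-less declare target form that takes the implicit-mappings path above (hypothetical usage; everything up to the matching end directive is mapped):

    #pragma omp declare target
    int device_global;
    void device_fn();
    #pragma omp end declare target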
ConsumeAnyToken(); - Actions.ActOnFinishedOpenMPDeclareTargetContext(DTCI); + Actions.OpenMP().ActOnFinishedOpenMPDeclareTargetContext(DTCI); break; } case OMPD_declare_simd: @@ -3118,7 +3133,7 @@ OMPClause *Parser::ParseOpenMPSizesClause() { T.consumeClose(); - return Actions.ActOnOpenMPSizesClause( + return Actions.OpenMP().ActOnOpenMPSizesClause( ValExprs, ClauseNameLoc, T.getOpenLocation(), T.getCloseLocation()); } @@ -3130,7 +3145,7 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { BalancedDelimiterTracker T(*this, tok::l_paren, tok::annot_pragma_openmp_end); if (T.expectAndConsume(diag::err_expected_lparen_after, "uses_allocator")) return nullptr; - SmallVector Data; + SmallVector Data; do { CXXScopeSpec SS; Token Replacement; @@ -3144,7 +3159,7 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { StopBeforeMatch); break; } - Sema::UsesAllocatorsData &D = Data.emplace_back(); + SemaOpenMP::UsesAllocatorsData &D = Data.emplace_back(); D.Allocator = Allocator.get(); if (Tok.is(tok::l_paren)) { BalancedDelimiterTracker T(*this, tok::l_paren, @@ -3169,8 +3184,8 @@ OMPClause *Parser::ParseOpenMPUsesAllocatorClause(OpenMPDirectiveKind DKind) { ConsumeAnyToken(); } while (Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)); T.consumeClose(); - return Actions.ActOnOpenMPUsesAllocatorClause(Loc, T.getOpenLocation(), - T.getCloseLocation(), Data); + return Actions.OpenMP().ActOnOpenMPUsesAllocatorClause( + Loc, T.getOpenLocation(), T.getCloseLocation(), Data); } /// Parsing of OpenMP clauses. @@ -3538,15 +3553,16 @@ OMPClause *Parser::ParseOpenMPSingleExprClause(OpenMPClauseKind Kind, if (ParseOnly) return nullptr; - return Actions.ActOnOpenMPSingleExprClause(Kind, Val.get(), Loc, LLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPSingleExprClause(Kind, Val.get(), Loc, + LLoc, RLoc); } /// Parse indirect clause for '#pragma omp declare target' directive. /// 'indirect' '[' '(' invoked-by-fptr ')' ']' /// where invoked-by-fptr is a constant boolean expression that evaluates to /// true or false at compile time. 
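A sketch of the indirect clause syntax documented above, assuming OpenMP 5.1 usage:

    #pragma omp begin declare target indirect(1)
    void callee(); // may be invoked through a function pointer on the device
    #pragma omp end declare target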
-bool Parser::ParseOpenMPIndirectClause(Sema::DeclareTargetContextInfo &DTCI, - bool ParseOnly) { +bool Parser::ParseOpenMPIndirectClause( + SemaOpenMP::DeclareTargetContextInfo &DTCI, bool ParseOnly) { SourceLocation Loc = ConsumeToken(); SourceLocation RLoc; @@ -3721,15 +3737,16 @@ OMPClause *Parser::ParseOpenMPInteropClause(OpenMPClauseKind Kind, return nullptr; if (Kind == OMPC_init) - return Actions.ActOnOpenMPInitClause(InteropVarExpr.get(), InteropInfo, Loc, - T.getOpenLocation(), VarLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPInitClause( + InteropVarExpr.get(), InteropInfo, Loc, T.getOpenLocation(), VarLoc, + RLoc); if (Kind == OMPC_use) - return Actions.ActOnOpenMPUseClause(InteropVarExpr.get(), Loc, - T.getOpenLocation(), VarLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPUseClause( + InteropVarExpr.get(), Loc, T.getOpenLocation(), VarLoc, RLoc); if (Kind == OMPC_destroy) - return Actions.ActOnOpenMPDestroyClause(InteropVarExpr.get(), Loc, - T.getOpenLocation(), VarLoc, RLoc); + return Actions.OpenMP().ActOnOpenMPDestroyClause( + InteropVarExpr.get(), Loc, T.getOpenLocation(), VarLoc, RLoc); llvm_unreachable("Unexpected interop variable clause."); } @@ -3787,8 +3804,8 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) { }; } - return Actions.ActOnOpenMPXAttributeClause(Attrs, Loc, T.getOpenLocation(), - T.getCloseLocation()); + return Actions.OpenMP().ActOnOpenMPXAttributeClause( + Attrs, Loc, T.getOpenLocation(), T.getCloseLocation()); } /// Parsing of simple OpenMP clauses like 'default' or 'proc_bind'. @@ -3823,9 +3840,8 @@ OMPClause *Parser::ParseOpenMPSimpleClause(OpenMPClauseKind Kind, << getOpenMPClauseName(OMPC_default) << "5.1"; return nullptr; } - return Actions.ActOnOpenMPSimpleClause(Kind, Val->Type, - Val->TypeLoc, Val->LOpen, - Val->Loc, Val->RLoc); + return Actions.OpenMP().ActOnOpenMPSimpleClause( + Kind, Val->Type, Val->TypeLoc, Val->LOpen, Val->Loc, Val->RLoc); } /// Parsing of OpenMP clauses like 'ordered'. @@ -3860,7 +3876,7 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPClauseKind Kind, bool ParseOnly) { if (ParseOnly) return nullptr; - return Actions.ActOnOpenMPClause(Kind, Loc, Tok.getLocation()); + return Actions.OpenMP().ActOnOpenMPClause(Kind, Loc, Tok.getLocation()); } /// Parsing of OpenMP clauses with single expressions and some additional @@ -4118,7 +4134,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPDirectiveKind DKind, if (ParseOnly) return nullptr; - return Actions.ActOnOpenMPSingleExprWithArgClause( + return Actions.OpenMP().ActOnOpenMPSingleExprWithArgClause( Kind, Arg, Val.get(), Loc, T.getOpenLocation(), KLoc, DelimLoc, RLoc); } @@ -4184,7 +4200,7 @@ static OpenMPMapModifierKind isMapModifier(Parser &P) { } /// Parse the mapper modifier in map, to, and from clauses. -bool Parser::parseMapperModifier(Sema::OpenMPVarListDataTy &Data) { +bool Parser::parseMapperModifier(SemaOpenMP::OpenMPVarListDataTy &Data) { // Parse '('. BalancedDelimiterTracker T(*this, tok::l_paren, tok::colon); if (T.expectAndConsume(diag::err_expected_lparen_after, "mapper")) { @@ -4216,7 +4232,7 @@ bool Parser::parseMapperModifier(Sema::OpenMPVarListDataTy &Data) { /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] 
map-type : ] list)
/// where, map-type-modifier ::= always | close | mapper(mapper-identifier) |
/// present
-bool Parser::parseMapTypeModifiers(Sema::OpenMPVarListDataTy &Data) {
+bool Parser::parseMapTypeModifiers(SemaOpenMP::OpenMPVarListDataTy &Data) {
   while (getCurToken().isNot(tok::colon)) {
     OpenMPMapModifierKind TypeModifier = isMapModifier(*this);
     if (TypeModifier == OMPC_MAP_MODIFIER_always ||
@@ -4282,7 +4298,7 @@ static OpenMPMapClauseKind isMapType(Parser &P) {
 /// Parse map-type in map clause.
 /// map([ [map-type-modifier[,] [map-type-modifier[,] ...] map-type : ] list)
 /// where, map-type ::= to | from | tofrom | alloc | release | delete
-static void parseMapType(Parser &P, Sema::OpenMPVarListDataTy &Data) {
+static void parseMapType(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data) {
   Token Tok = P.getCurToken();
   if (Tok.is(tok::colon)) {
     P.Diag(Tok, diag::err_omp_map_type_missing);
@@ -4306,7 +4322,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
     return ExprError();
   SourceLocation LLoc = T.getOpenLocation();
-  SmallVector<Sema::OMPIteratorData, 4> Data;
+  SmallVector<SemaOpenMP::OMPIteratorData, 4> Data;
   while (Tok.isNot(tok::r_paren) && Tok.isNot(tok::annot_pragma_openmp_end)) {
     // Check if the type parsing is required.
     ParsedType IteratorType;
@@ -4380,7 +4396,7 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
     if (Tok.is(tok::comma))
       ConsumeToken();
-    Sema::OMPIteratorData &D = Data.emplace_back();
+    SemaOpenMP::OMPIteratorData &D = Data.emplace_back();
     D.DeclIdent = II;
     D.DeclIdentLoc = IdLoc;
     D.Type = IteratorType;
@@ -4397,12 +4413,12 @@ ExprResult Parser::ParseOpenMPIteratorsExpr() {
   if (!T.consumeClose())
     RLoc = T.getCloseLocation();
-  return Actions.ActOnOMPIteratorExpr(getCurScope(), IteratorKwLoc, LLoc, RLoc,
-                                      Data);
+  return Actions.OpenMP().ActOnOMPIteratorExpr(getCurScope(), IteratorKwLoc,
+                                               LLoc, RLoc, Data);
 }

 bool Parser::ParseOpenMPReservedLocator(OpenMPClauseKind Kind,
-                                        Sema::OpenMPVarListDataTy &Data,
+                                        SemaOpenMP::OpenMPVarListDataTy &Data,
                                         const LangOptions &LangOpts) {
   // Currently the only reserved locator is 'omp_all_memory' which is only
   // allowed on a depend clause.
@@ -4430,7 +4446,7 @@ bool Parser::ParseOpenMPReservedLocator(OpenMPClauseKind Kind,
 /// Parse step size expression. Returns true if parsing is successful,
 /// otherwise returns false.
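// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The `Sema::OpenMPVarListDataTy` ->
// `SemaOpenMP::OpenMPVarListDataTy` and `Sema::OMPIteratorData` ->
// `SemaOpenMP::OMPIteratorData` renames above are the data half of the same
// split: the structs the parser fills in move with the subsystem that
// consumes them, so only the qualifier changes at each use. Toy illustration
// with hypothetical names, not the real types:

#include <vector>

struct OpenMPSubsystem {
  struct VarListData {       // previously a nested type of the big class
    std::vector<int> Vars;   // stand-in for the real expression/location fields
  };
  void actOnVarList(const VarListData &Data) { (void)Data; }
};

void parseVarList(OpenMPSubsystem &OMP) {
  OpenMPSubsystem::VarListData Data; // parser now names the subsystem's type
  Data.Vars.push_back(42);
  OMP.actOnVarList(Data);
}
// ---------------------------------------------------------------------------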
-static bool parseStepSize(Parser &P, Sema::OpenMPVarListDataTy &Data,
+static bool parseStepSize(Parser &P, SemaOpenMP::OpenMPVarListDataTy &Data,
                           OpenMPClauseKind CKind, SourceLocation ELoc) {
   ExprResult Tail = P.ParseAssignmentExpression();
   Sema &Actions = P.getActions();
@@ -4451,7 +4467,7 @@ static bool parseStepSize(Parser &P, Sema::OpenMPVarListDataTy &Data,
 bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind,
                                 OpenMPClauseKind Kind,
                                 SmallVectorImpl<Expr *> &Vars,
-                                Sema::OpenMPVarListDataTy &Data) {
+                                SemaOpenMP::OpenMPVarListDataTy &Data) {
   UnqualifiedId UnqualifiedReductionId;
   bool InvalidReductionId = false;
   bool IsInvalidMapperModifier = false;
@@ -4961,7 +4977,7 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
   SourceLocation Loc = Tok.getLocation();
   SourceLocation LOpen = ConsumeToken();
   SmallVector<Expr *, 4> Vars;
-  Sema::OpenMPVarListDataTy Data;
+  SemaOpenMP::OpenMPVarListDataTy Data;
   if (ParseOpenMPVarList(DKind, Kind, Vars, Data))
     return nullptr;
@@ -4969,5 +4985,5 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind,
   if (ParseOnly)
     return nullptr;
   OMPVarListLocTy Locs(Loc, LOpen, Data.RLoc);
-  return Actions.ActOnOpenMPVarListClause(Kind, Vars, Locs, Data);
+  return Actions.OpenMP().ActOnOpenMPVarListClause(Kind, Vars, Locs, Data);
 }
diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp
index 76a3fa8f2627d..629421c01d17d 100644
--- a/clang/lib/Parse/ParseStmt.cpp
+++ b/clang/lib/Parse/ParseStmt.cpp
@@ -22,6 +22,7 @@
 #include "clang/Sema/DeclSpec.h"
 #include "clang/Sema/EnterExpressionEvaluationContext.h"
 #include "clang/Sema/Scope.h"
+#include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/TypoCorrection.h"
 #include "llvm/ADT/STLExtras.h"
 #include <optional>
@@ -2301,7 +2302,7 @@ StmtResult Parser::ParseForStatement(SourceLocation *TrailingElseLoc) {
     // In OpenMP loop region loop control variable must be captured and be
     // private. Perform analysis of first part (if any).
     if (getLangOpts().OpenMP && FirstPart.isUsable()) {
-      Actions.ActOnOpenMPLoopInitialization(ForLoc, FirstPart.get());
+      Actions.OpenMP().ActOnOpenMPLoopInitialization(ForLoc, FirstPart.get());
     }
   }
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index d6f2b9f448cd5..ef46fc74cedc1 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -1441,7 +1441,7 @@ Decl *Parser::ParseFunctionDefinition(ParsingDeclarator &D,
   // Tell the actions module that we have entered a function definition with the
   // specified Declarator for the function.
-  Sema::SkipBodyInfo SkipBody;
+  SkipBodyInfo SkipBody;
   Decl *Res = Actions.ActOnStartOfFunctionDef(getCurScope(), D,
                                               TemplateInfo.TemplateParams ?
                                                  *TemplateInfo.TemplateParams
diff --git a/clang/lib/Sema/CMakeLists.txt b/clang/lib/Sema/CMakeLists.txt
index ab3b813a9ccd9..a96439df66422 100644
--- a/clang/lib/Sema/CMakeLists.txt
+++ b/clang/lib/Sema/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   Core
+  Demangle
   FrontendHLSL
   FrontendOpenMP
   MC
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 8de202f4f7a0c..a1e32d391ed0c 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -46,6 +46,7 @@
 #include "clang/Sema/SemaHLSL.h"
 #include "clang/Sema/SemaInternal.h"
 #include "clang/Sema/SemaOpenACC.h"
+#include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/SemaSYCL.h"
 #include "clang/Sema/TemplateDeduction.h"
 #include "clang/Sema/TemplateInstCallback.h"
@@ -203,6 +204,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
       CUDAPtr(std::make_unique<SemaCUDA>(*this)),
       HLSLPtr(std::make_unique<SemaHLSL>(*this)),
       OpenACCPtr(std::make_unique<SemaOpenACC>(*this)),
+      OpenMPPtr(std::make_unique<SemaOpenMP>(*this)),
       SYCLPtr(std::make_unique<SemaSYCL>(*this)),
       MSPointerToMemberRepresentationMethod(
           LangOpts.getMSPointerToMemberRepresentationMethod()),
@@ -226,8 +228,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
       StringWithUTF8StringMethod(nullptr),
       ValueWithBytesObjCTypeMethod(nullptr), NSArrayDecl(nullptr),
       ArrayWithObjectsMethod(nullptr), NSDictionaryDecl(nullptr),
-      DictionaryWithObjectsMethod(nullptr), CodeCompleter(CodeCompleter),
-      VarDataSharingAttributesStack(nullptr) {
+      DictionaryWithObjectsMethod(nullptr), CodeCompleter(CodeCompleter) {
   assert(pp.TUKind == TUKind);
   TUScope = nullptr;
@@ -252,7 +253,7 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
       nullptr, ExpressionEvaluationContextRecord::EK_Other);

   // Initialization of data sharing attributes stack for OpenMP
-  InitDataSharingAttributesStack();
+  OpenMP().InitDataSharingAttributesStack();

   std::unique_ptr<sema::SemaPPCallbacks> Callbacks =
       std::make_unique<sema::SemaPPCallbacks>();
@@ -501,7 +502,7 @@ Sema::~Sema() {
   threadSafety::threadSafetyCleanup(ThreadSafetyDeclCache);

   // Destroys data sharing attributes stack for OpenMP
-  DestroyDataSharingAttributesStack();
+  OpenMP().DestroyDataSharingAttributesStack();

   // Detach from the PP callback handler which outlives Sema since it's owned
   // by the preprocessor.
@@ -1159,7 +1160,7 @@ void Sema::ActOnEndOfTranslationUnit() {
   DiagnoseUnterminatedPragmaAlignPack();
   DiagnoseUnterminatedPragmaAttribute();
-  DiagnoseUnterminatedOpenMPDeclareTarget();
+  OpenMP().DiagnoseUnterminatedOpenMPDeclareTarget();

   // All delayed member exception specs should be checked or we end up accepting
   // incompatible declarations.
@@ -1747,7 +1748,7 @@ class DeferredDiagnosticsEmitter
     // Finalize analysis of OpenMP-specific constructs.
     if (Caller && S.LangOpts.OpenMP && UsePath.size() == 1 &&
         (ShouldEmitRootNode || InOMPDeviceContext))
-      S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc);
+      S.OpenMP().finalizeOpenMPDelayedAnalysis(Caller, FD, Loc);
     if (Caller)
       S.CUDA().DeviceKnownEmittedFns[FD] = {Caller, Loc};
     // Always emit deferred diagnostics for the direct users. This does not
@@ -1899,8 +1900,8 @@ Sema::targetDiag(SourceLocation Loc, unsigned DiagID, const FunctionDecl *FD) {
   FD = FD ? FD : getCurFunctionDecl();
   if (LangOpts.OpenMP)
     return LangOpts.OpenMPIsTargetDevice
-               ? diagIfOpenMPDeviceCode(Loc, DiagID, FD)
-               : diagIfOpenMPHostCode(Loc, DiagID, FD);
+               ? OpenMP().diagIfOpenMPDeviceCode(Loc, DiagID, FD)
+               : OpenMP().diagIfOpenMPHostCode(Loc, DiagID, FD);
   if (getLangOpts().CUDA)
     return getLangOpts().CUDAIsDevice ?
CUDA().DiagIfDeviceCode(Loc, DiagID) : CUDA().DiagIfHostCode(Loc, DiagID); @@ -2131,7 +2132,7 @@ void Sema::PushFunctionScope() { FunctionScopes.push_back(new FunctionScopeInfo(getDiagnostics())); } if (LangOpts.OpenMP) - pushOpenMPFunctionRegion(); + OpenMP().pushOpenMPFunctionRegion(); } void Sema::PushBlockScope(Scope *BlockScope, BlockDecl *Block) { @@ -2251,7 +2252,7 @@ Sema::PopFunctionScopeInfo(const AnalysisBasedWarnings::Policy *WP, PoppedFunctionScopeDeleter(this)); if (LangOpts.OpenMP) - popOpenMPFunctionRegion(Scope.get()); + OpenMP().popOpenMPFunctionRegion(Scope.get()); // Issue any analysis-based warnings. if (WP && D) @@ -2687,7 +2688,9 @@ void Sema::PushCapturedRegionScope(Scope *S, CapturedDecl *CD, RecordDecl *RD, unsigned OpenMPCaptureLevel) { auto *CSI = new CapturedRegionScopeInfo( getDiagnostics(), S, CD, RD, CD->getContextParam(), K, - (getLangOpts().OpenMP && K == CR_OpenMP) ? getOpenMPNestingLevel() : 0, + (getLangOpts().OpenMP && K == CR_OpenMP) + ? OpenMP().getOpenMPNestingLevel() + : 0, OpenMPCaptureLevel); CSI->ReturnType = Context.VoidTy; FunctionScopes.push_back(CSI); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index b0c28531fe873..126fd3797417c 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -155,7 +155,7 @@ namespace { Self.CheckCastAlign(SrcExpr.get(), DestType, OpRange); } - void checkObjCConversion(Sema::CheckedConversionKind CCK) { + void checkObjCConversion(CheckedConversionKind CCK) { assert(Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()); Expr *src = SrcExpr.get(); @@ -248,18 +248,14 @@ static TryCastResult TryStaticMemberPointerUpcast(Sema &Self, ExprResult &SrcExp CastKind &Kind, CXXCastPath &BasePath); -static TryCastResult TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, - QualType DestType, - Sema::CheckedConversionKind CCK, - SourceRange OpRange, - unsigned &msg, CastKind &Kind, - bool ListInitialization); +static TryCastResult +TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, + CheckedConversionKind CCK, SourceRange OpRange, + unsigned &msg, CastKind &Kind, bool ListInitialization); static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, - QualType DestType, - Sema::CheckedConversionKind CCK, - SourceRange OpRange, - unsigned &msg, CastKind &Kind, - CXXCastPath &BasePath, + QualType DestType, CheckedConversionKind CCK, + SourceRange OpRange, unsigned &msg, + CastKind &Kind, CXXCastPath &BasePath, bool ListInitialization); static TryCastResult TryConstCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, bool CStyle, @@ -1223,7 +1219,7 @@ void CastOperation::CheckReinterpretCast() { if (isValidCast(tcr)) { if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) - checkObjCConversion(Sema::CCK_OtherCast); + checkObjCConversion(CheckedConversionKind::OtherCast); DiagnoseReinterpretUpDownCast(Self, SrcExpr.get(), DestType, OpRange); if (unsigned DiagID = checkCastFunctionType(Self, SrcExpr, DestType)) @@ -1274,9 +1270,9 @@ void CastOperation::CheckStaticCast() { } unsigned msg = diag::err_bad_cxx_cast_generic; - TryCastResult tcr - = TryStaticCast(Self, SrcExpr, DestType, Sema::CCK_OtherCast, OpRange, msg, - Kind, BasePath, /*ListInitialization=*/false); + TryCastResult tcr = + TryStaticCast(Self, SrcExpr, DestType, CheckedConversionKind::OtherCast, + OpRange, msg, Kind, BasePath, /*ListInitialization=*/false); if (tcr != TC_Success && msg != 0) { if (SrcExpr.isInvalid()) return; @@ -1296,7 +1292,7 @@ void 
CastOperation::CheckStaticCast() { if (Kind == CK_BitCast) checkCastAlign(); if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) - checkObjCConversion(Sema::CCK_OtherCast); + checkObjCConversion(CheckedConversionKind::OtherCast); } else { SrcExpr = ExprError(); } @@ -1317,14 +1313,13 @@ static bool IsAddressSpaceConversion(QualType SrcType, QualType DestType) { /// possible. If @p CStyle, ignore access restrictions on hierarchy casting /// and casting away constness. static TryCastResult TryStaticCast(Sema &Self, ExprResult &SrcExpr, - QualType DestType, - Sema::CheckedConversionKind CCK, + QualType DestType, CheckedConversionKind CCK, SourceRange OpRange, unsigned &msg, CastKind &Kind, CXXCastPath &BasePath, bool ListInitialization) { // Determine whether we have the semantics of a C-style cast. - bool CStyle - = (CCK == Sema::CCK_CStyleCast || CCK == Sema::CCK_FunctionalCast); + bool CStyle = (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast); // The order the tests is not entirely arbitrary. There is one conversion // that can be handled in two different ways. Given: @@ -1884,11 +1879,11 @@ TryStaticMemberPointerUpcast(Sema &Self, ExprResult &SrcExpr, QualType SrcType, /// /// An expression e can be explicitly converted to a type T using a /// @c static_cast if the declaration "T t(e);" is well-formed [...]. -TryCastResult -TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, - Sema::CheckedConversionKind CCK, - SourceRange OpRange, unsigned &msg, - CastKind &Kind, bool ListInitialization) { +TryCastResult TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, + QualType DestType, + CheckedConversionKind CCK, + SourceRange OpRange, unsigned &msg, + CastKind &Kind, bool ListInitialization) { if (DestType->isRecordType()) { if (Self.RequireCompleteType(OpRange.getBegin(), DestType, diag::err_bad_cast_incomplete) || @@ -1900,13 +1895,14 @@ TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, } InitializedEntity Entity = InitializedEntity::InitializeTemporary(DestType); - InitializationKind InitKind - = (CCK == Sema::CCK_CStyleCast) - ? InitializationKind::CreateCStyleCast(OpRange.getBegin(), OpRange, - ListInitialization) - : (CCK == Sema::CCK_FunctionalCast) - ? InitializationKind::CreateFunctionalCast(OpRange, ListInitialization) - : InitializationKind::CreateCast(OpRange); + InitializationKind InitKind = + (CCK == CheckedConversionKind::CStyleCast) + ? InitializationKind::CreateCStyleCast(OpRange.getBegin(), OpRange, + ListInitialization) + : (CCK == CheckedConversionKind::FunctionalCast) + ? InitializationKind::CreateFunctionalCast(OpRange, + ListInitialization) + : InitializationKind::CreateCast(OpRange); Expr *SrcExprRaw = SrcExpr.get(); // FIXME: Per DR242, we should check for an implicit conversion sequence // or for a constructor that could be invoked by direct-initialization @@ -1918,8 +1914,8 @@ TryStaticImplicitCast(Sema &Self, ExprResult &SrcExpr, QualType DestType, // There is no other way that works. // On the other hand, if we're checking a C-style cast, we've still got // the reinterpret_cast way. 
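// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The cast changes in this file
// replace the class-nested, unscoped enumerators `Sema::CCK_*` with a scoped
// `enum class CheckedConversionKind` (`Implicit`, `CStyleCast`,
// `FunctionalCast`, `OtherCast`), so the constants are qualified by the enum
// rather than by Sema and no longer convert implicitly to integers. Minimal
// standalone shape of that migration:

enum class CheckedConversionKind { Implicit, CStyleCast, FunctionalCast, OtherCast };

// The "C-style semantics" test that several hunks here rewrite mechanically.
bool hasCStyleSemantics(CheckedConversionKind CCK) {
  return CCK == CheckedConversionKind::CStyleCast ||
         CCK == CheckedConversionKind::FunctionalCast;
}
// ---------------------------------------------------------------------------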
-  bool CStyle
-    = (CCK == Sema::CCK_CStyleCast || CCK == Sema::CCK_FunctionalCast);
+  bool CStyle = (CCK == CheckedConversionKind::CStyleCast ||
+                 CCK == CheckedConversionKind::FunctionalCast);
   if (InitSeq.Failed() && (CStyle || !DestType->isReferenceType()))
     return TC_NotApplicable;
@@ -2814,8 +2810,9 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle,
   if (isValidCast(tcr))
     Kind = CK_NoOp;

-  Sema::CheckedConversionKind CCK =
-      FunctionalStyle ? Sema::CCK_FunctionalCast : Sema::CCK_CStyleCast;
+  CheckedConversionKind CCK = FunctionalStyle
+                                  ? CheckedConversionKind::FunctionalCast
+                                  : CheckedConversionKind::CStyleCast;
   if (tcr == TC_NotApplicable) {
     tcr = TryAddressSpaceCast(Self, SrcExpr, DestType, /*CStyle*/ true, msg,
                               Kind);
@@ -3201,7 +3198,7 @@ void CastOperation::CheckCStyleCast() {
   // ARC imposes extra restrictions on casts.
   if (Self.getLangOpts().allowsNonTrivialObjCLifetimeQualifiers()) {
-    checkObjCConversion(Sema::CCK_CStyleCast);
+    checkObjCConversion(CheckedConversionKind::CStyleCast);
     if (SrcExpr.isInvalid())
       return;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 8e21811b67d90..d814327d23470 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3233,6 +3233,17 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID,
     if (BuiltinCountZeroBitsGeneric(*this, TheCall))
       return ExprError();
     break;
+
+  case Builtin::BI__builtin_allow_runtime_check: {
+    Expr *Arg = TheCall->getArg(0);
+    // Check if the argument is a string literal.
+    if (!isa<StringLiteral>(Arg->IgnoreParenImpCasts())) {
+      Diag(TheCall->getBeginLoc(), diag::err_expr_not_string_literal)
+          << Arg->getSourceRange();
+      return ExprError();
+    }
+    break;
+  }
   }

   if (getLangOpts().HLSL && CheckHLSLBuiltinFunctionCall(BuiltinID, TheCall))
@@ -7938,7 +7949,6 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
   // For variadic functions, we may have more args than parameters.
   // For some K&R functions, we may have less args than parameters.
   const auto N = std::min<unsigned>(Proto->getNumParams(), Args.size());
-  bool AnyScalableArgsOrRet = Proto->getReturnType()->isSizelessVectorType();
   for (unsigned ArgIdx = 0; ArgIdx < N; ++ArgIdx) {
     // Args[ArgIdx] can be null in malformed code.
     if (const Expr *Arg = Args[ArgIdx]) {
@@ -7952,8 +7962,6 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
         checkAIXMemberAlignment((Arg->getExprLoc()), Arg);

       QualType ParamTy = Proto->getParamType(ArgIdx);
-      if (ParamTy->isSizelessVectorType())
-        AnyScalableArgsOrRet = true;
       QualType ArgTy = Arg->getType();
       CheckArgAlignment(Arg->getExprLoc(), FDecl, std::to_string(ArgIdx + 1),
                         ArgTy, ParamTy);
@@ -7974,23 +7982,6 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
     }
   }

-  // If the call requires a streaming-mode change and has scalable vector
-  // arguments or return values, then warn the user that the streaming and
-  // non-streaming vector lengths may be different.
- const auto *CallerFD = dyn_cast(CurContext); - if (CallerFD && (!FD || !FD->getBuiltinID()) && AnyScalableArgsOrRet) { - bool IsCalleeStreaming = - ExtInfo.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; - bool IsCalleeStreamingCompatible = - ExtInfo.AArch64SMEAttributes & - FunctionType::SME_PStateSMCompatibleMask; - ArmStreamingType CallerFnType = getArmStreamingFnType(CallerFD); - if (!IsCalleeStreamingCompatible && - (CallerFnType == ArmStreamingCompatible || - ((CallerFnType == ArmStreaming) ^ IsCalleeStreaming))) - Diag(Loc, diag::warn_sme_streaming_pass_return_vl_to_non_streaming); - } - FunctionType::ArmStateValue CalleeArmZAState = FunctionType::getArmZAState(ExtInfo.AArch64SMEAttributes); FunctionType::ArmStateValue CalleeArmZT0State = @@ -7999,7 +7990,7 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, CalleeArmZT0State != FunctionType::ARM_None) { bool CallerHasZAState = false; bool CallerHasZT0State = false; - if (CallerFD) { + if (const auto *CallerFD = dyn_cast(CurContext)) { auto *Attr = CallerFD->getAttr(); if (Attr && Attr->isNewZA()) CallerHasZAState = true; diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 8b3b9d020db57..1bde99d6fce74 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -48,6 +48,7 @@ #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaHLSL.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/SmallString.h" @@ -3036,7 +3037,7 @@ static void checkNewAttributesAfterDef(Sema &S, Decl *New, const Decl *Old) { if (isa(NewAttribute) || isa(NewAttribute)) { if (FunctionDecl *FD = dyn_cast(New)) { - Sema::SkipBodyInfo SkipBody; + SkipBodyInfo SkipBody; S.CheckForFunctionRedefinition(FD, cast(Def), &SkipBody); // If we're skipping this definition, drop the "alias" attribute. @@ -5373,7 +5374,7 @@ static bool CheckAnonMemberRedeclaration(Sema &SemaRef, Scope *S, LookupResult R(SemaRef, Name, NameLoc, Owner->isRecord() ? Sema::LookupMemberName : Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); if (!SemaRef.LookupName(R, S)) return false; // Pick a representative declaration. @@ -6168,11 +6169,12 @@ Decl *Sema::ActOnDeclarator(Scope *S, Declarator &D) { // Check if we are in an `omp begin/end declare variant` scope. Handle this // declaration only if the `bind_to_declaration` extension is set. 
SmallVector Bases; - if (LangOpts.OpenMP && isInOpenMPDeclareVariantScope()) - if (getOMPTraitInfoForSurroundingScope()->isExtensionActive(llvm::omp::TraitProperty:: - implementation_extension_bind_to_declaration)) - ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( - S, D, MultiTemplateParamsArg(), Bases); + if (LangOpts.OpenMP && OpenMP().isInOpenMPDeclareVariantScope()) + if (OpenMP().getOMPTraitInfoForSurroundingScope()->isExtensionActive( + llvm::omp::TraitProperty:: + implementation_extension_bind_to_declaration)) + OpenMP().ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( + S, D, MultiTemplateParamsArg(), Bases); Decl *Dcl = HandleDeclarator(S, D, MultiTemplateParamsArg()); @@ -6181,7 +6183,8 @@ Decl *Sema::ActOnDeclarator(Scope *S, Declarator &D) { Dcl->setTopLevelDeclInObjCContainer(); if (!Bases.empty()) - ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, Bases); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, + Bases); return Dcl; } @@ -6467,7 +6470,8 @@ NamedDecl *Sema::HandleDeclarator(Scope *S, Declarator &D, if (IsLinkageLookup) { Previous.clear(LookupRedeclarationWithLinkage); - Previous.setRedeclarationKind(ForExternalRedeclaration); + Previous.setRedeclarationKind( + RedeclarationKind::ForExternalRedeclaration); } LookupName(Previous, S, CreateBuiltins); @@ -6568,8 +6572,8 @@ NamedDecl *Sema::HandleDeclarator(Scope *S, Declarator &D, if (New->getDeclName() && AddToScope) PushOnScopeChains(New, S); - if (isInOpenMPDeclareTargetContext()) - checkDeclIsAllowedInOpenMPTarget(nullptr, New); + if (OpenMP().isInOpenMPDeclareTargetContext()) + OpenMP().checkDeclIsAllowedInOpenMPTarget(nullptr, New); return New; } @@ -8518,7 +8522,8 @@ void Sema::CheckShadow(Scope *S, VarDecl *D) { return; LookupResult R(*this, D->getDeclName(), D->getLocation(), - Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration); + Sema::LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); LookupName(R, S); if (NamedDecl *ShadowedDecl = getShadowedDeclaration(D, R)) CheckShadow(D, ShadowedDecl, R); @@ -9158,7 +9163,7 @@ static NamedDecl *DiagnoseInvalidRedeclaration( LookupResult Prev(SemaRef, Name, NewFD->getLocation(), IsLocalFriend ? Sema::LookupLocalFriendName : Sema::LookupOrdinaryName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); NewFD->setInvalidDecl(); if (IsLocalFriend) @@ -12268,7 +12273,7 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } if (LangOpts.OpenMP) - ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(NewFD); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(NewFD); // Semantic checking for this function declaration (in isolation). @@ -12403,22 +12408,12 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } // Check if the function definition uses any AArch64 SME features without - // having the '+sme' feature enabled and warn user if sme locally streaming - // function returns or uses arguments with VL-based types. + // having the '+sme' feature enabled. 
if (DeclIsDefn) { const auto *Attr = NewFD->getAttr(); bool UsesSM = NewFD->hasAttr(); bool UsesZA = Attr && Attr->isNewZA(); bool UsesZT0 = Attr && Attr->isNewZT0(); - - if (NewFD->hasAttr()) { - if (NewFD->getReturnType()->isSizelessVectorType() || - llvm::any_of(NewFD->parameters(), [](ParmVarDecl *P) { - return P->getOriginalType()->isSizelessVectorType(); - })) - Diag(NewFD->getLocation(), - diag::warn_sme_locally_streaming_has_vl_args_returns); - } if (const auto *FPT = NewFD->getType()->getAs()) { FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); UsesSM |= @@ -12668,7 +12663,7 @@ void Sema::CheckMSVCRTEntryPoint(FunctionDecl *FD) { } } -bool Sema::CheckForConstantInitializer(Expr *Init, QualType DclT) { +bool Sema::CheckForConstantInitializer(Expr *Init, unsigned DiagID) { // FIXME: Need strict checking. In C89, we need to check for // any assignment, increment, decrement, function-calls, or // commas outside of a sizeof. In C99, it's the same list, @@ -12686,8 +12681,7 @@ bool Sema::CheckForConstantInitializer(Expr *Init, QualType DclT) { const Expr *Culprit; if (Init->isConstantInitializer(Context, false, &Culprit)) return false; - Diag(Culprit->getExprLoc(), diag::err_init_element_not_constant) - << Culprit->getSourceRange(); + Diag(Culprit->getExprLoc(), DiagID) << Culprit->getSourceRange(); return true; } @@ -13805,29 +13799,24 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // OpenCL v1.2 s6.5.3: __constant locals must be constant-initialized. // This is true even in C++ for OpenCL. } else if (VDecl->getType().getAddressSpace() == LangAS::opencl_constant) { - CheckForConstantInitializer(Init, DclT); + CheckForConstantInitializer(Init); - // Otherwise, C++ does not restrict the initializer. + // Otherwise, C++ does not restrict the initializer. } else if (getLangOpts().CPlusPlus) { // do nothing // C99 6.7.8p4: All the expressions in an initializer for an object that has // static storage duration shall be constant expressions or string literals. } else if (VDecl->getStorageClass() == SC_Static) { - CheckForConstantInitializer(Init, DclT); + CheckForConstantInitializer(Init); - // C89 is stricter than C99 for aggregate initializers. - // C89 6.5.7p3: All the expressions [...] in an initializer list - // for an object that has aggregate or union type shall be - // constant expressions. + // C89 is stricter than C99 for aggregate initializers. + // C89 6.5.7p3: All the expressions [...] in an initializer list + // for an object that has aggregate or union type shall be + // constant expressions. } else if (!getLangOpts().C99 && VDecl->getType()->isAggregateType() && isa(Init)) { - const Expr *Culprit; - if (!Init->isConstantInitializer(Context, false, &Culprit)) { - Diag(Culprit->getExprLoc(), - diag::ext_aggregate_init_not_constant) - << Culprit->getSourceRange(); - } + CheckForConstantInitializer(Init, diag::ext_aggregate_init_not_constant); } if (auto *E = dyn_cast(Init)) @@ -13960,7 +13949,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // Avoid duplicate diagnostics for constexpr variables. 
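// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The hunks around here change
// CheckForConstantInitializer to take the diagnostic ID instead of the
// (unused) declared type, letting one routine emit either the default C99
// error or the C89 aggregate-initializer extension warning. A toy version of
// that shape, with hypothetical IDs (the real default argument lives on the
// declaration in Sema.h):

#include <cstdio>

enum : unsigned { err_init_not_constant = 1, ext_aggregate_init_not_constant = 2 };

// Returns true (after diagnosing) when the initializer is not constant.
bool checkForConstantInit(bool IsConstant, unsigned DiagID = err_init_not_constant) {
  if (IsConstant)
    return false;
  std::printf("diag %u: initializer element is not constant\n", DiagID);
  return true;
}

int main() {
  checkForConstantInit(false);                                  // default error
  checkForConstantInit(false, ext_aggregate_init_not_constant); // C89 case
}
// ---------------------------------------------------------------------------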
if (!getLangOpts().CPlusPlus && !VDecl->isInvalidDecl() && !VDecl->isConstexpr()) - CheckForConstantInitializer(Init, DclT); + CheckForConstantInitializer(Init); } QualType InitType = Init->getType(); @@ -14956,7 +14945,7 @@ Sema::DeclGroupPtrTy Sema::FinalizeDeclaratorGroup(Scope *S, const DeclSpec &DS, if (auto *VD = dyn_cast(D); LangOpts.OpenMP && VD && VD->hasAttr() && VD->hasGlobalStorage()) - ActOnOpenMPDeclareTargetInitializer(D); + OpenMP().ActOnOpenMPDeclareTargetInitializer(D); // For declarators, there are some additional syntactic-ish checks we need // to perform. if (auto *DD = dyn_cast(D)) { @@ -15199,7 +15188,7 @@ Decl *Sema::ActOnParamDeclarator(Scope *S, Declarator &D, const IdentifierInfo *II = D.getIdentifier(); if (II) { LookupResult R(*this, II, D.getIdentifierLoc(), LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(R, S); if (!R.empty()) { NamedDecl *PrevDecl = *R.begin(); @@ -15495,8 +15484,8 @@ Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Declarator &D, // specialization function under the OpenMP context defined as part of the // `omp begin declare variant`. SmallVector Bases; - if (LangOpts.OpenMP && isInOpenMPDeclareVariantScope()) - ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( + if (LangOpts.OpenMP && OpenMP().isInOpenMPDeclareVariantScope()) + OpenMP().ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( ParentScope, D, TemplateParameterLists, Bases); D.setFunctionDefinitionKind(FunctionDefinitionKind::Definition); @@ -15504,7 +15493,8 @@ Sema::ActOnStartOfFunctionDef(Scope *FnBodyScope, Declarator &D, Decl *Dcl = ActOnStartOfFunctionDef(FnBodyScope, DP, SkipBody, BodyKind); if (!Bases.empty()) - ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, Bases); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope(Dcl, + Bases); return Dcl; } @@ -17430,7 +17420,7 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, RedeclarationKind Redecl = forRedeclarationInCurContext(); if (TUK == TUK_Friend || TUK == TUK_Reference) - Redecl = NotForRedeclaration; + Redecl = RedeclarationKind::NotForRedeclaration; /// Create a new tag decl in C/ObjC. 
Since the ODR-like semantics for ObjC/C /// implemented asks for structural equivalence checking, the returned decl @@ -18591,7 +18581,7 @@ FieldDecl *Sema::HandleField(Scope *S, RecordDecl *Record, // Check to see if this name was declared as a member previously NamedDecl *PrevDecl = nullptr; LookupResult Previous(*this, II, Loc, LookupMemberName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); switch (Previous.getResultKind()) { case LookupResult::Found: @@ -18995,8 +18985,9 @@ Decl *Sema::ActOnIvar(Scope *S, SourceLocation DeclStart, Declarator &D, NewID->setInvalidDecl(); if (II) { - NamedDecl *PrevDecl = LookupSingleName(S, II, Loc, LookupMemberName, - ForVisibleRedeclaration); + NamedDecl *PrevDecl = + LookupSingleName(S, II, Loc, LookupMemberName, + RedeclarationKind::ForVisibleRedeclaration); if (PrevDecl && isDeclInScope(PrevDecl, EnclosingContext, S) && !isa(PrevDecl)) { Diag(Loc, diag::err_duplicate_member) << II; @@ -20001,7 +19992,7 @@ EnumConstantDecl *Sema::CheckEnumConstant(EnumDecl *Enum, Val, EnumVal); } -Sema::SkipBodyInfo Sema::shouldSkipAnonEnumBody(Scope *S, IdentifierInfo *II, +SkipBodyInfo Sema::shouldSkipAnonEnumBody(Scope *S, IdentifierInfo *II, SourceLocation IILoc) { if (!(getLangOpts().Modules || getLangOpts().ModulesLocalVisibility) || !getLangOpts().CPlusPlus) @@ -20041,7 +20032,8 @@ Decl *Sema::ActOnEnumConstant(Scope *S, Decl *theEnumDecl, Decl *lastEnumConst, // Verify that there isn't already something declared with this name in this // scope. - LookupResult R(*this, Id, IdLoc, LookupOrdinaryName, ForVisibleRedeclaration); + LookupResult R(*this, Id, IdLoc, LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); LookupName(R, S); NamedDecl *PrevDecl = R.getAsSingle(); @@ -20651,7 +20643,7 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(const FunctionDecl *FD, return FunctionEmissionStatus::OMPDiscarded; // If we have an explicit value for the device type, or we are in a target // declare context, we need to emit all extern and used symbols. - if (isInOpenMPDeclareTargetContext() || DevTy) + if (OpenMP().isInOpenMPDeclareTargetContext() || DevTy) if (IsEmittedForExternalSymbol()) return FunctionEmissionStatus::Emitted; // Device mode only emits what it must, if it wasn't tagged yet and needed, diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index b7b1fbc625a15..c3bf18a3f79e2 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -45,6 +45,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLForwardCompat.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Demangle/Demangle.h" #include "llvm/IR/Assumptions.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Support/Error.h" @@ -1983,6 +1984,38 @@ static void handleWeakRefAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) WeakRefAttr(S.Context, AL)); } +// Mark alias/ifunc target as used. Due to name mangling, we look up the +// demangled name ignoring parameters (not supported by microsoftDemangle +// https://github.com/llvm/llvm-project/issues/88825). This should handle the +// majority of use cases while leaving namespace scope names unmarked. 
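// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] A standalone model of the strategy
// the comment above describes and the function below implements: demangle the
// attribute string to a bare identifier, look that identifier up, then
// re-mangle each candidate and mark only an exact mangled-name match as used.
// The toy "mangler" just prepends "_Z"; the real code uses
// llvm::itaniumDemangle and clang::MangleContext.

#include <map>
#include <string>

static std::string toyMangle(const std::string &Plain) { return "_Z" + Plain; }
static std::string toyDemangle(const std::string &Mangled) {
  return Mangled.rfind("_Z", 0) == 0 ? Mangled.substr(2) : Mangled;
}

// Returns the looked-up entry whose re-mangled name equals Target, if any.
const int *findAliasTarget(const std::map<std::string, int> &Symbols,
                           const std::string &Target) {
  auto It = Symbols.find(toyDemangle(Target)); // demangled-name lookup
  if (It != Symbols.end() && toyMangle(It->first) == Target)
    return &It->second; // the real code calls ND->markUsed(...) here
  return nullptr;
}
// ---------------------------------------------------------------------------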
+static void markUsedForAliasOrIfunc(Sema &S, Decl *D, const ParsedAttr &AL,
+                                    StringRef Str) {
+  std::unique_ptr<char, llvm::FreeDeleter> Demangled;
+  if (S.getASTContext().getCXXABIKind() != TargetCXXABI::Microsoft)
+    Demangled.reset(llvm::itaniumDemangle(Str, /*ParseParams=*/false));
+  std::unique_ptr<MangleContext> MC(S.Context.createMangleContext());
+  SmallString<256> Name;
+
+  const DeclarationNameInfo Target(
+      &S.Context.Idents.get(Demangled ? Demangled.get() : Str), AL.getLoc());
+  LookupResult LR(S, Target, Sema::LookupOrdinaryName);
+  if (S.LookupName(LR, S.TUScope)) {
+    for (NamedDecl *ND : LR) {
+      if (!isa<FunctionDecl>(ND) && !isa<VarDecl>(ND))
+        continue;
+      if (MC->shouldMangleDeclName(ND)) {
+        llvm::raw_svector_ostream Out(Name);
+        Name.clear();
+        MC->mangleName(GlobalDecl(ND), Out);
+      } else {
+        Name = ND->getIdentifier()->getName();
+      }
+      if (Name == Str)
+        ND->markUsed(S.Context);
+    }
+  }
+}
+
 static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   StringRef Str;
   if (!S.checkStringLiteralArgumentAttr(AL, 0, Str))
     return;
@@ -1995,6 +2028,7 @@ static void handleIFuncAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     return;
   }

+  markUsedForAliasOrIfunc(S, D, AL, Str);
   D->addAttr(::new (S.Context) IFuncAttr(S.Context, AL, Str));
 }

@@ -2029,17 +2063,7 @@ static void handleAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
     }
   }

-  // Mark target used to prevent unneeded-internal-declaration warnings.
-  if (!S.LangOpts.CPlusPlus) {
-    // FIXME: demangle Str for C++, as the attribute refers to the mangled
-    // linkage name, not the pre-mangled identifier.
-    const DeclarationNameInfo target(&S.Context.Idents.get(Str), AL.getLoc());
-    LookupResult LR(S, target, Sema::LookupOrdinaryName);
-    if (S.LookupQualifiedName(LR, S.getCurLexicalContext()))
-      for (NamedDecl *ND : LR)
-        ND->markUsed(S.Context);
-  }
-
+  markUsedForAliasOrIfunc(S, D, AL, Str);
   D->addAttr(::new (S.Context) AliasAttr(S.Context, AL, Str));
 }

diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 7669171fea56f..591016243b0ac 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -44,6 +44,7 @@
 #include "clang/Sema/ScopeInfo.h"
 #include "clang/Sema/SemaCUDA.h"
 #include "clang/Sema/SemaInternal.h"
+#include "clang/Sema/SemaOpenMP.h"
 #include "clang/Sema/Template.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
@@ -895,7 +896,7 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
   assert(VarName && "Cannot have an unnamed binding declaration");

   LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
-                        ForVisibleRedeclaration);
+                        RedeclarationKind::ForVisibleRedeclaration);
   LookupName(Previous, S,
              /*CreateBuiltins*/DC->getRedeclContext()->isTranslationUnit());
@@ -950,7 +951,7 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D,
   DeclarationNameInfo NameInfo((IdentifierInfo *)nullptr,
                                Decomp.getLSquareLoc());
   LookupResult Previous(*this, NameInfo, LookupOrdinaryName,
-                        ForVisibleRedeclaration);
+                        RedeclarationKind::ForVisibleRedeclaration);

   // Build the variable that holds the non-decomposed object.
bool AddToScope = true; @@ -962,8 +963,8 @@ Sema::ActOnDecompositionDeclarator(Scope *S, Declarator &D, CurContext->addHiddenDecl(New); } - if (isInOpenMPDeclareTargetContext()) - checkDeclIsAllowedInOpenMPTarget(nullptr, New); + if (OpenMP().isInOpenMPDeclareTargetContext()) + OpenMP().checkDeclIsAllowedInOpenMPTarget(nullptr, New); return New; } @@ -11714,7 +11715,7 @@ Decl *Sema::ActOnStartNamespaceDef(Scope *NamespcScope, // look through using directives, just look for any ordinary names // as if by qualified name lookup. LookupResult R(*this, II, IdentLoc, LookupOrdinaryName, - ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); LookupQualifiedName(R, CurContext->getRedeclContext()); NamedDecl *PrevDecl = R.isSingleResult() ? R.getRepresentativeDecl() : nullptr; @@ -12915,7 +12916,7 @@ NamedDecl *Sema::BuildUsingDeclaration( // Do the redeclaration lookup in the current scope. LookupResult Previous(*this, UsingName, LookupUsingDeclName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); Previous.setHideTags(false); if (S) { LookupName(Previous, S); @@ -13158,7 +13159,7 @@ NamedDecl *Sema::BuildUsingEnumDeclaration(Scope *S, AccessSpecifier AS, /// In class scope, check if this is a duplicate, for better a diagnostic. DeclarationNameInfo UsingEnumName(ED->getDeclName(), NameLoc); LookupResult Previous(*this, UsingEnumName, LookupUsingDeclName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); @@ -13191,7 +13192,7 @@ NamedDecl *Sema::BuildUsingEnumDeclaration(Scope *S, AccessSpecifier AS, UsingShadowDecl *PrevDecl = nullptr; DeclarationNameInfo DNI(EC->getDeclName(), EC->getLocation()); LookupResult Previous(*this, DNI, LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); FilterUsingLookup(S, Previous); @@ -13586,7 +13587,7 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, LookupResult Previous(*this, NameInfo, LookupOrdinaryName, TemplateParamLists.size() ? forRedeclarationInCurContext() - : ForVisibleRedeclaration); + : RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); // Warn about shadowing the name of a template parameter. @@ -13736,7 +13737,7 @@ Decl *Sema::ActOnNamespaceAliasDef(Scope *S, SourceLocation NamespaceLoc, // Check if we have a previous declaration with the same name. LookupResult PrevR(*this, Alias, AliasLoc, LookupOrdinaryName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(PrevR, S); // Check we're not shadowing a template parameter. @@ -13982,7 +13983,7 @@ void Sema::CheckImplicitSpecialMemberDeclaration(Scope *S, FunctionDecl *FD) { // implicit special members with this name. DeclarationName Name = FD->getDeclName(); LookupResult R(*this, Name, SourceLocation(), LookupOrdinaryName, - ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); for (auto *D : FD->getParent()->lookup(Name)) if (auto *Acceptable = R.getAcceptableDecl(D)) R.addDecl(Acceptable); @@ -17112,9 +17113,9 @@ Decl *Sema::ActOnExceptionDeclarator(Scope *S, Declarator &D) { } const IdentifierInfo *II = D.getIdentifier(); - if (NamedDecl *PrevDecl = LookupSingleName(S, II, D.getIdentifierLoc(), - LookupOrdinaryName, - ForVisibleRedeclaration)) { + if (NamedDecl *PrevDecl = + LookupSingleName(S, II, D.getIdentifierLoc(), LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration)) { // The scope should be freshly made just for us. 
There is just no way // it contains any previous declaration, except for function parameters in // a function-try-block's catch statement. @@ -17905,7 +17906,7 @@ NamedDecl *Sema::ActOnFriendFunctionDecl(Scope *S, Declarator &D, DeclContext *DC; Scope *DCScope = S; LookupResult Previous(*this, NameInfo, LookupOrdinaryName, - ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); bool isTemplateId = D.getName().getKind() == UnqualifiedIdKind::IK_TemplateId; @@ -18654,8 +18655,8 @@ void Sema::MarkVTableUsed(SourceLocation Loc, CXXRecordDecl *Class, // Do not mark as used if compiling for the device outside of the target // region. if (TUKind != TU_Prefix && LangOpts.OpenMP && LangOpts.OpenMPIsTargetDevice && - !isInOpenMPDeclareTargetContext() && - !isInOpenMPTargetExecutionDirective()) { + !OpenMP().isInOpenMPDeclareTargetContext() && + !OpenMP().isInOpenMPTargetExecutionDirective()) { if (!DefinitionRequired) MarkVirtualMembersReferenced(Loc, Class); return; @@ -19241,7 +19242,7 @@ MSPropertyDecl *Sema::HandleMSProperty(Scope *S, RecordDecl *Record, // Check to see if this name was declared as a member previously NamedDecl *PrevDecl = nullptr; LookupResult Previous(*this, II, Loc, LookupMemberName, - ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); LookupName(Previous, S); switch (Previous.getResultKind()) { case LookupResult::Found: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 24f354f1c7249..d2c77ad61644f 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -52,6 +52,7 @@ #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaFixItUtils.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/STLForwardCompat.h" @@ -360,9 +361,9 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, // at the same location. // [OpenMP 5.2] Also allow iterator declared variables. if (LangOpts.OpenMP && isa(D) && - !isOpenMPDeclareMapperVarDeclAllowed(cast(D))) { + !OpenMP().isOpenMPDeclareMapperVarDeclAllowed(cast(D))) { Diag(Loc, diag::err_omp_declare_mapper_wrong_var) - << getOpenMPDeclareMapperVarName(); + << OpenMP().getOpenMPDeclareMapperVarName(); Diag(D->getLocation(), diag::note_entity_declared_at) << D; return true; } @@ -2267,7 +2268,7 @@ NonOdrUseReason Sema::getNonOdrUseReasonInCurrentContext(ValueDecl *D) { // be loaded from the captured. if (VarDecl *VD = dyn_cast(D)) { if (VD->getType()->isReferenceType() && - !(getLangOpts().OpenMP && isOpenMPCapturedDecl(D)) && + !(getLangOpts().OpenMP && OpenMP().isOpenMPCapturedDecl(D)) && !isCapturingReferenceToHostVarInCUDADeviceLambda(*this, VD) && VD->isUsableInConstantExpressions(Context)) return NOUR_Constant; @@ -5080,9 +5081,10 @@ ExprResult Sema::ActOnArraySubscriptExpr(Scope *S, Expr *base, if (base && !base->getType().isNull() && base->hasPlaceholderType(BuiltinType::OMPArraySection)) - return ActOnOMPArraySectionExpr(base, lbLoc, ArgExprs.front(), SourceLocation(), - SourceLocation(), /*Length*/ nullptr, - /*Stride=*/nullptr, rbLoc); + return OpenMP().ActOnOMPArraySectionExpr(base, lbLoc, ArgExprs.front(), + SourceLocation(), SourceLocation(), + /*Length*/ nullptr, + /*Stride=*/nullptr, rbLoc); // Since this might be a postfix expression, get rid of ParenListExprs. 
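// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] ActOnOMPArraySectionExpr -- which
// the hunks below move out of Sema into SemaOpenMP unchanged -- validates an
// OpenMP array section `base[lb : length : stride]`, designating the `length`
// elements lb, lb+stride, ..., lb+(length-1)*stride. The moved code rejects a
// negative lower bound or length and a non-positive stride; a standalone
// model of the index set those rules keep well formed:

#include <cassert>
#include <vector>

std::vector<int> sectionIndices(int LB, int Len, int Stride) {
  assert(LB >= 0 && Len >= 0 && Stride > 0 && "rejected by Sema otherwise");
  std::vector<int> Idx;
  for (int I = 0; I < Len; ++I)
    Idx.push_back(LB + I * Stride);
  return Idx;
}

int main() {
  auto Idx = sectionIndices(1, 3, 2); // arr[1:3:2] -> elements 1, 3, 5
  assert(Idx.size() == 3 && Idx.back() == 5);
}
// ---------------------------------------------------------------------------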
if (isa(base)) { @@ -5354,558 +5356,6 @@ void Sema::CheckSubscriptAccessOfNoDeref(const ArraySubscriptExpr *E) { } } -ExprResult Sema::ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc, - Expr *LowerBound, - SourceLocation ColonLocFirst, - SourceLocation ColonLocSecond, - Expr *Length, Expr *Stride, - SourceLocation RBLoc) { - if (Base->hasPlaceholderType() && - !Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { - ExprResult Result = CheckPlaceholderExpr(Base); - if (Result.isInvalid()) - return ExprError(); - Base = Result.get(); - } - if (LowerBound && LowerBound->getType()->isNonOverloadPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(LowerBound); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - LowerBound = Result.get(); - } - if (Length && Length->getType()->isNonOverloadPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Length); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - Length = Result.get(); - } - if (Stride && Stride->getType()->isNonOverloadPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Stride); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - Stride = Result.get(); - } - - // Build an unanalyzed expression if either operand is type-dependent. - if (Base->isTypeDependent() || - (LowerBound && - (LowerBound->isTypeDependent() || LowerBound->isValueDependent())) || - (Length && (Length->isTypeDependent() || Length->isValueDependent())) || - (Stride && (Stride->isTypeDependent() || Stride->isValueDependent()))) { - return new (Context) OMPArraySectionExpr( - Base, LowerBound, Length, Stride, Context.DependentTy, VK_LValue, - OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); - } - - // Perform default conversions. 
- QualType OriginalTy = OMPArraySectionExpr::getBaseOriginalType(Base); - QualType ResultTy; - if (OriginalTy->isAnyPointerType()) { - ResultTy = OriginalTy->getPointeeType(); - } else if (OriginalTy->isArrayType()) { - ResultTy = OriginalTy->getAsArrayTypeUnsafe()->getElementType(); - } else { - return ExprError( - Diag(Base->getExprLoc(), diag::err_omp_typecheck_section_value) - << Base->getSourceRange()); - } - // C99 6.5.2.1p1 - if (LowerBound) { - auto Res = PerformOpenMPImplicitIntegerConversion(LowerBound->getExprLoc(), - LowerBound); - if (Res.isInvalid()) - return ExprError(Diag(LowerBound->getExprLoc(), - diag::err_omp_typecheck_section_not_integer) - << 0 << LowerBound->getSourceRange()); - LowerBound = Res.get(); - - if (LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || - LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) - Diag(LowerBound->getExprLoc(), diag::warn_omp_section_is_char) - << 0 << LowerBound->getSourceRange(); - } - if (Length) { - auto Res = - PerformOpenMPImplicitIntegerConversion(Length->getExprLoc(), Length); - if (Res.isInvalid()) - return ExprError(Diag(Length->getExprLoc(), - diag::err_omp_typecheck_section_not_integer) - << 1 << Length->getSourceRange()); - Length = Res.get(); - - if (Length->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || - Length->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) - Diag(Length->getExprLoc(), diag::warn_omp_section_is_char) - << 1 << Length->getSourceRange(); - } - if (Stride) { - ExprResult Res = - PerformOpenMPImplicitIntegerConversion(Stride->getExprLoc(), Stride); - if (Res.isInvalid()) - return ExprError(Diag(Stride->getExprLoc(), - diag::err_omp_typecheck_section_not_integer) - << 1 << Stride->getSourceRange()); - Stride = Res.get(); - - if (Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || - Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) - Diag(Stride->getExprLoc(), diag::warn_omp_section_is_char) - << 1 << Stride->getSourceRange(); - } - - // C99 6.5.2.1p1: "shall have type "pointer to *object* type". Similarly, - // C++ [expr.sub]p1: The type "T" shall be a completely-defined object - // type. Note that functions are not objects, and that (in C99 parlance) - // incomplete types are not object types. - if (ResultTy->isFunctionType()) { - Diag(Base->getExprLoc(), diag::err_omp_section_function_type) - << ResultTy << Base->getSourceRange(); - return ExprError(); - } - - if (RequireCompleteType(Base->getExprLoc(), ResultTy, - diag::err_omp_section_incomplete_type, Base)) - return ExprError(); - - if (LowerBound && !OriginalTy->isAnyPointerType()) { - Expr::EvalResult Result; - if (LowerBound->EvaluateAsInt(Result, Context)) { - // OpenMP 5.0, [2.1.5 Array Sections] - // The array section must be a subset of the original array. - llvm::APSInt LowerBoundValue = Result.Val.getInt(); - if (LowerBoundValue.isNegative()) { - Diag(LowerBound->getExprLoc(), diag::err_omp_section_not_subset_of_array) - << LowerBound->getSourceRange(); - return ExprError(); - } - } - } - - if (Length) { - Expr::EvalResult Result; - if (Length->EvaluateAsInt(Result, Context)) { - // OpenMP 5.0, [2.1.5 Array Sections] - // The length must evaluate to non-negative integers. 
- llvm::APSInt LengthValue = Result.Val.getInt(); - if (LengthValue.isNegative()) { - Diag(Length->getExprLoc(), diag::err_omp_section_length_negative) - << toString(LengthValue, /*Radix=*/10, /*Signed=*/true) - << Length->getSourceRange(); - return ExprError(); - } - } - } else if (ColonLocFirst.isValid() && - (OriginalTy.isNull() || (!OriginalTy->isConstantArrayType() && - !OriginalTy->isVariableArrayType()))) { - // OpenMP 5.0, [2.1.5 Array Sections] - // When the size of the array dimension is not known, the length must be - // specified explicitly. - Diag(ColonLocFirst, diag::err_omp_section_length_undefined) - << (!OriginalTy.isNull() && OriginalTy->isArrayType()); - return ExprError(); - } - - if (Stride) { - Expr::EvalResult Result; - if (Stride->EvaluateAsInt(Result, Context)) { - // OpenMP 5.0, [2.1.5 Array Sections] - // The stride must evaluate to a positive integer. - llvm::APSInt StrideValue = Result.Val.getInt(); - if (!StrideValue.isStrictlyPositive()) { - Diag(Stride->getExprLoc(), diag::err_omp_section_stride_non_positive) - << toString(StrideValue, /*Radix=*/10, /*Signed=*/true) - << Stride->getSourceRange(); - return ExprError(); - } - } - } - - if (!Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { - ExprResult Result = DefaultFunctionArrayLvalueConversion(Base); - if (Result.isInvalid()) - return ExprError(); - Base = Result.get(); - } - return new (Context) OMPArraySectionExpr( - Base, LowerBound, Length, Stride, Context.OMPArraySectionTy, VK_LValue, - OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); -} - -ExprResult Sema::ActOnOMPArrayShapingExpr(Expr *Base, SourceLocation LParenLoc, - SourceLocation RParenLoc, - ArrayRef Dims, - ArrayRef Brackets) { - if (Base->hasPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Base); - if (Result.isInvalid()) - return ExprError(); - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) - return ExprError(); - Base = Result.get(); - } - QualType BaseTy = Base->getType(); - // Delay analysis of the types/expressions if instantiation/specialization is - // required. - if (!BaseTy->isPointerType() && Base->isTypeDependent()) - return OMPArrayShapingExpr::Create(Context, Context.DependentTy, Base, - LParenLoc, RParenLoc, Dims, Brackets); - if (!BaseTy->isPointerType() || - (!Base->isTypeDependent() && - BaseTy->getPointeeType()->isIncompleteType())) - return ExprError(Diag(Base->getExprLoc(), - diag::err_omp_non_pointer_type_array_shaping_base) - << Base->getSourceRange()); - - SmallVector NewDims; - bool ErrorFound = false; - for (Expr *Dim : Dims) { - if (Dim->hasPlaceholderType()) { - ExprResult Result = CheckPlaceholderExpr(Dim); - if (Result.isInvalid()) { - ErrorFound = true; - continue; - } - Result = DefaultLvalueConversion(Result.get()); - if (Result.isInvalid()) { - ErrorFound = true; - continue; - } - Dim = Result.get(); - } - if (!Dim->isTypeDependent()) { - ExprResult Result = - PerformOpenMPImplicitIntegerConversion(Dim->getExprLoc(), Dim); - if (Result.isInvalid()) { - ErrorFound = true; - Diag(Dim->getExprLoc(), diag::err_omp_typecheck_shaping_not_integer) - << Dim->getSourceRange(); - continue; - } - Dim = Result.get(); - Expr::EvalResult EvResult; - if (!Dim->isValueDependent() && Dim->EvaluateAsInt(EvResult, Context)) { - // OpenMP 5.0, [2.1.4 Array Shaping] - // Each si is an integral type expression that must evaluate to a - // positive integer. 
- llvm::APSInt Value = EvResult.Val.getInt(); - if (!Value.isStrictlyPositive()) { - Diag(Dim->getExprLoc(), diag::err_omp_shaping_dimension_not_positive) - << toString(Value, /*Radix=*/10, /*Signed=*/true) - << Dim->getSourceRange(); - ErrorFound = true; - continue; - } - } - } - NewDims.push_back(Dim); - } - if (ErrorFound) - return ExprError(); - return OMPArrayShapingExpr::Create(Context, Context.OMPArrayShapingTy, Base, - LParenLoc, RParenLoc, NewDims, Brackets); -} - -ExprResult Sema::ActOnOMPIteratorExpr(Scope *S, SourceLocation IteratorKwLoc, - SourceLocation LLoc, SourceLocation RLoc, - ArrayRef Data) { - SmallVector ID; - bool IsCorrect = true; - for (const OMPIteratorData &D : Data) { - TypeSourceInfo *TInfo = nullptr; - SourceLocation StartLoc; - QualType DeclTy; - if (!D.Type.getAsOpaquePtr()) { - // OpenMP 5.0, 2.1.6 Iterators - // In an iterator-specifier, if the iterator-type is not specified then - // the type of that iterator is of int type. - DeclTy = Context.IntTy; - StartLoc = D.DeclIdentLoc; - } else { - DeclTy = GetTypeFromParser(D.Type, &TInfo); - StartLoc = TInfo->getTypeLoc().getBeginLoc(); - } - - bool IsDeclTyDependent = DeclTy->isDependentType() || - DeclTy->containsUnexpandedParameterPack() || - DeclTy->isInstantiationDependentType(); - if (!IsDeclTyDependent) { - if (!DeclTy->isIntegralType(Context) && !DeclTy->isAnyPointerType()) { - // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ - // The iterator-type must be an integral or pointer type. - Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) - << DeclTy; - IsCorrect = false; - continue; - } - if (DeclTy.isConstant(Context)) { - // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ - // The iterator-type must not be const qualified. - Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) - << DeclTy; - IsCorrect = false; - continue; - } - } - - // Iterator declaration. - assert(D.DeclIdent && "Identifier expected."); - // Always try to create iterator declarator to avoid extra error messages - // about unknown declarations use. - auto *VD = VarDecl::Create(Context, CurContext, StartLoc, D.DeclIdentLoc, - D.DeclIdent, DeclTy, TInfo, SC_None); - VD->setImplicit(); - if (S) { - // Check for conflicting previous declaration. - DeclarationNameInfo NameInfo(VD->getDeclName(), D.DeclIdentLoc); - LookupResult Previous(*this, NameInfo, LookupOrdinaryName, - ForVisibleRedeclaration); - Previous.suppressDiagnostics(); - LookupName(Previous, S); - - FilterLookupForScope(Previous, CurContext, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace=*/false); - if (!Previous.empty()) { - NamedDecl *Old = Previous.getRepresentativeDecl(); - Diag(D.DeclIdentLoc, diag::err_redefinition) << VD->getDeclName(); - Diag(Old->getLocation(), diag::note_previous_definition); - } else { - PushOnScopeChains(VD, S); - } - } else { - CurContext->addDecl(VD); - } - - /// Act on the iterator variable declaration. 
- ActOnOpenMPIteratorVarDecl(VD); - - Expr *Begin = D.Range.Begin; - if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) { - ExprResult BeginRes = - PerformImplicitConversion(Begin, DeclTy, AA_Converting); - Begin = BeginRes.get(); - } - Expr *End = D.Range.End; - if (!IsDeclTyDependent && End && !End->isTypeDependent()) { - ExprResult EndRes = PerformImplicitConversion(End, DeclTy, AA_Converting); - End = EndRes.get(); - } - Expr *Step = D.Range.Step; - if (!IsDeclTyDependent && Step && !Step->isTypeDependent()) { - if (!Step->getType()->isIntegralType(Context)) { - Diag(Step->getExprLoc(), diag::err_omp_iterator_step_not_integral) - << Step << Step->getSourceRange(); - IsCorrect = false; - continue; - } - std::optional Result = - Step->getIntegerConstantExpr(Context); - // OpenMP 5.0, 2.1.6 Iterators, Restrictions - // If the step expression of a range-specification equals zero, the - // behavior is unspecified. - if (Result && Result->isZero()) { - Diag(Step->getExprLoc(), diag::err_omp_iterator_step_constant_zero) - << Step << Step->getSourceRange(); - IsCorrect = false; - continue; - } - } - if (!Begin || !End || !IsCorrect) { - IsCorrect = false; - continue; - } - OMPIteratorExpr::IteratorDefinition &IDElem = ID.emplace_back(); - IDElem.IteratorDecl = VD; - IDElem.AssignmentLoc = D.AssignLoc; - IDElem.Range.Begin = Begin; - IDElem.Range.End = End; - IDElem.Range.Step = Step; - IDElem.ColonLoc = D.ColonLoc; - IDElem.SecondColonLoc = D.SecColonLoc; - } - if (!IsCorrect) { - // Invalidate all created iterator declarations if error is found. - for (const OMPIteratorExpr::IteratorDefinition &D : ID) { - if (Decl *ID = D.IteratorDecl) - ID->setInvalidDecl(); - } - return ExprError(); - } - SmallVector Helpers; - if (!CurContext->isDependentContext()) { - // Build number of ityeration for each iteration range. - // Ni = ((Stepi > 0) ? 
((Endi + Stepi -1 - Begini)/Stepi) : - // ((Begini-Stepi-1-Endi) / -Stepi); - for (OMPIteratorExpr::IteratorDefinition &D : ID) { - // (Endi - Begini) - ExprResult Res = CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, D.Range.End, - D.Range.Begin); - if(!Res.isUsable()) { - IsCorrect = false; - continue; - } - ExprResult St, St1; - if (D.Range.Step) { - St = D.Range.Step; - // (Endi - Begini) + Stepi - Res = CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res.get(), St.get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - // (Endi - Begini) + Stepi - 1 - Res = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, Res.get(), - ActOnIntegerConstant(D.AssignmentLoc, 1).get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - // ((Endi - Begini) + Stepi - 1) / Stepi - Res = CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res.get(), St.get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - St1 = CreateBuiltinUnaryOp(D.AssignmentLoc, UO_Minus, D.Range.Step); - // (Begini - Endi) - ExprResult Res1 = CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, - D.Range.Begin, D.Range.End); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // (Begini - Endi) - Stepi - Res1 = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res1.get(), St1.get()); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // (Begini - Endi) - Stepi - 1 - Res1 = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, Res1.get(), - ActOnIntegerConstant(D.AssignmentLoc, 1).get()); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // ((Begini - Endi) - Stepi - 1) / (-Stepi) - Res1 = - CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res1.get(), St1.get()); - if (!Res1.isUsable()) { - IsCorrect = false; - continue; - } - // Stepi > 0. - ExprResult CmpRes = - CreateBuiltinBinOp(D.AssignmentLoc, BO_GT, D.Range.Step, - ActOnIntegerConstant(D.AssignmentLoc, 0).get()); - if (!CmpRes.isUsable()) { - IsCorrect = false; - continue; - } - Res = ActOnConditionalOp(D.AssignmentLoc, D.AssignmentLoc, CmpRes.get(), - Res.get(), Res1.get()); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - } - Res = ActOnFinishFullExpr(Res.get(), /*DiscardedValue=*/false); - if (!Res.isUsable()) { - IsCorrect = false; - continue; - } - - // Build counter update. - // Build counter. - auto *CounterVD = - VarDecl::Create(Context, CurContext, D.IteratorDecl->getBeginLoc(), - D.IteratorDecl->getBeginLoc(), nullptr, - Res.get()->getType(), nullptr, SC_None); - CounterVD->setImplicit(); - ExprResult RefRes = - BuildDeclRefExpr(CounterVD, CounterVD->getType(), VK_LValue, - D.IteratorDecl->getBeginLoc()); - // Build counter update. 
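// ---------------------------------------------------------------------------
// [Editor's sketch, not part of the patch] The expression trees built above
// encode, per iterator range, the trip count
//   N = (Step > 0) ? (End + Step - 1 - Begin) / Step
//                  : (Begin - Step - 1 - End) / -Step;
// and the counter update that follows recovers the iterator value from a
// flat counter as I = Begin + counter * Step. The same arithmetic on plain
// integers:

#include <cassert>

long tripCount(long Begin, long End, long Step) {
  return Step > 0 ? (End + Step - 1 - Begin) / Step
                  : (Begin - Step - 1 - End) / -Step;
}

int main() {
  assert(tripCount(0, 10, 3) == 4);  // iterator(i = 0:10:3) -> 0, 3, 6, 9
  assert(tripCount(10, 0, -2) == 5); // iterator(i = 10:0:-2) -> 10, 8, 6, 4, 2
  for (long K = 0; K < tripCount(0, 10, 3); ++K)
    assert(0 + K * 3 < 10); // I = Begin + counter * Step stays in range
}
// ---------------------------------------------------------------------------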
-      // I = Begini + counter * Stepi;
-      ExprResult UpdateRes;
-      if (D.Range.Step) {
-        UpdateRes = CreateBuiltinBinOp(
-            D.AssignmentLoc, BO_Mul,
-            DefaultLvalueConversion(RefRes.get()).get(), St.get());
-      } else {
-        UpdateRes = DefaultLvalueConversion(RefRes.get());
-      }
-      if (!UpdateRes.isUsable()) {
-        IsCorrect = false;
-        continue;
-      }
-      UpdateRes = CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, D.Range.Begin,
-                                     UpdateRes.get());
-      if (!UpdateRes.isUsable()) {
-        IsCorrect = false;
-        continue;
-      }
-      ExprResult VDRes =
-          BuildDeclRefExpr(cast<ValueDecl>(D.IteratorDecl),
-                           cast<ValueDecl>(D.IteratorDecl)->getType(), VK_LValue,
-                           D.IteratorDecl->getBeginLoc());
-      UpdateRes = CreateBuiltinBinOp(D.AssignmentLoc, BO_Assign, VDRes.get(),
-                                     UpdateRes.get());
-      if (!UpdateRes.isUsable()) {
-        IsCorrect = false;
-        continue;
-      }
-      UpdateRes =
-          ActOnFinishFullExpr(UpdateRes.get(), /*DiscardedValue=*/true);
-      if (!UpdateRes.isUsable()) {
-        IsCorrect = false;
-        continue;
-      }
-      ExprResult CounterUpdateRes =
-          CreateBuiltinUnaryOp(D.AssignmentLoc, UO_PreInc, RefRes.get());
-      if (!CounterUpdateRes.isUsable()) {
-        IsCorrect = false;
-        continue;
-      }
-      CounterUpdateRes =
-          ActOnFinishFullExpr(CounterUpdateRes.get(), /*DiscardedValue=*/true);
-      if (!CounterUpdateRes.isUsable()) {
-        IsCorrect = false;
-        continue;
-      }
-      OMPIteratorHelperData &HD = Helpers.emplace_back();
-      HD.CounterVD = CounterVD;
-      HD.Upper = Res.get();
-      HD.Update = UpdateRes.get();
-      HD.CounterUpdate = CounterUpdateRes.get();
-    }
-  } else {
-    Helpers.assign(ID.size(), {});
-  }
-  if (!IsCorrect) {
-    // Invalidate all created iterator declarations if error is found.
-    for (const OMPIteratorExpr::IteratorDefinition &D : ID) {
-      if (Decl *ID = D.IteratorDecl)
-        ID->setInvalidDecl();
-    }
-    return ExprError();
-  }
-  return OMPIteratorExpr::Create(Context, Context.OMPIteratorTy, IteratorKwLoc,
-                                 LLoc, RLoc, ID, Helpers);
-}
-
 ExprResult
 Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
                                       Expr *Idx, SourceLocation RLoc) {
@@ -7190,8 +6640,8 @@ ExprResult Sema::ActOnCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
   }
 
   if (LangOpts.OpenMP)
-    Call = ActOnOpenMPCall(Call, Scope, LParenLoc, ArgExprs, RParenLoc,
-                           ExecConfig);
+    Call = OpenMP().ActOnOpenMPCall(Call, Scope, LParenLoc, ArgExprs, RParenLoc,
+                                    ExecConfig);
   if (LangOpts.CPlusPlus) {
     if (const auto *CE = dyn_cast<CallExpr>(Call.get()))
       DiagnosedUnqualifiedCallsToStdFunctions(*this, CE);
@@ -7739,7 +7189,8 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
   }
 
   if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(FDecl))
-    if (Method->isImplicitObjectMemberFunction())
+    if (!isa<RequiresExprBodyDecl>(CurContext) &&
+        Method->isImplicitObjectMemberFunction())
       return ExprError(Diag(LParenLoc, diag::err_member_call_without_object)
                        << Fn->getSourceRange() << 0);
 
@@ -7880,7 +7331,7 @@ Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo,
   if (!LiteralExpr->isTypeDependent() &&
       !LiteralExpr->isValueDependent() &&
       !literalType->isDependentType()) // C99 6.5.2.5p3
-    if (CheckForConstantInitializer(LiteralExpr, literalType))
+    if (CheckForConstantInitializer(LiteralExpr))
       return ExprError();
   } else if (literalType.getAddressSpace() != LangAS::opencl_private &&
              literalType.getAddressSpace() != LangAS::Default) {
@@ -10726,8 +10177,9 @@ Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS,
   // diagnostics and just checking for errors, e.g., during overload
   // resolution, return Incompatible to indicate the failure.
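// [Editorial aside] Hypothetical C input for the BuildCompoundLiteralExpr
// hunk above: at file scope a compound literal's initializer must be
// constant per C99 6.5.2.5p3, e.g.
//   int n = 3;
//   static int *p = (int[]){1, n};   // error: 'n' is not a constant
// CheckForConstantInitializer has dropped its QualType parameter, presumably
// recovering the type from the expression itself.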
if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() && - CheckObjCConversion(SourceRange(), Ty, E, CCK_ImplicitConversion, - Diagnose, DiagnoseCFAudited) != ACR_okay) { + CheckObjCConversion(SourceRange(), Ty, E, + CheckedConversionKind::Implicit, Diagnose, + DiagnoseCFAudited) != ACR_okay) { if (!Diagnose) return Incompatible; } @@ -13448,14 +12900,15 @@ QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS, Expr *E = LHS.get(); if (getLangOpts().ObjCAutoRefCount) CheckObjCConversion(SourceRange(), RHSType, E, - CCK_ImplicitConversion); + CheckedConversionKind::Implicit); LHS = ImpCastExprToType(E, RHSType, RPT ? CK_BitCast :CK_CPointerToObjCPointerCast); } else { Expr *E = RHS.get(); if (getLangOpts().ObjCAutoRefCount) - CheckObjCConversion(SourceRange(), LHSType, E, CCK_ImplicitConversion, + CheckObjCConversion(SourceRange(), LHSType, E, + CheckedConversionKind::Implicit, /*Diagnose=*/true, /*DiagnoseCFAudited=*/false, Opc); RHS = ImpCastExprToType(E, LHSType, @@ -19192,7 +18645,7 @@ MarkVarDeclODRUsed(ValueDecl *V, SourceLocation Loc, Sema &SemaRef, } QualType CaptureType, DeclRefType; if (SemaRef.LangOpts.OpenMP) - SemaRef.tryCaptureOpenMPLambdas(V); + SemaRef.OpenMP().tryCaptureOpenMPLambdas(V); SemaRef.tryCaptureVariable(V, Loc, Sema::TryCapture_Implicit, /*EllipsisLoc*/ SourceLocation(), /*BuildAndDiagnose*/ true, CaptureType, @@ -19473,7 +18926,7 @@ static bool captureInBlock(BlockScopeInfo *BSI, ValueDecl *Var, const bool HasBlocksAttr = Var->hasAttr(); if (HasBlocksAttr || CaptureType->isReferenceType() || - (S.getLangOpts().OpenMP && S.isOpenMPCapturedDecl(Var))) { + (S.getLangOpts().OpenMP && S.OpenMP().isOpenMPCapturedDecl(Var))) { // Block capture by reference does not change the capture or // declaration reference types. ByRef = true; @@ -19503,7 +18956,7 @@ static bool captureInCapturedRegion( ByRef = (Kind == Sema::TryCapture_ExplicitByRef); } else if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) { // Using an LValue reference type is consistent with Lambdas (see below). - if (S.isOpenMPCapturedDecl(Var)) { + if (S.OpenMP().isOpenMPCapturedDecl(Var)) { bool HasConst = DeclRefType.isConstQualified(); DeclRefType = DeclRefType.getUnqualifiedType(); // Don't lose diagnostics about assignments to const. @@ -19511,11 +18964,11 @@ static bool captureInCapturedRegion( DeclRefType.addConst(); } // Do not capture firstprivates in tasks. - if (S.isOpenMPPrivateDecl(Var, RSI->OpenMPLevel, RSI->OpenMPCaptureLevel) != - OMPC_unknown) + if (S.OpenMP().isOpenMPPrivateDecl(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel) != OMPC_unknown) return true; - ByRef = S.isOpenMPCapturedByRef(Var, RSI->OpenMPLevel, - RSI->OpenMPCaptureLevel); + ByRef = S.OpenMP().isOpenMPCapturedByRef(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel); } if (ByRef) @@ -19776,9 +19229,9 @@ bool Sema::tryCaptureVariable( // Capture global variables if it is required to use private copy of this // variable. 
bool IsGlobal = !VD->hasLocalStorage(); - if (IsGlobal && - !(LangOpts.OpenMP && isOpenMPCapturedDecl(Var, /*CheckScopeInfo=*/true, - MaxFunctionScopesIndex))) + if (IsGlobal && !(LangOpts.OpenMP && + OpenMP().isOpenMPCapturedDecl(Var, /*CheckScopeInfo=*/true, + MaxFunctionScopesIndex))) return true; if (isa(Var)) @@ -19896,7 +19349,7 @@ bool Sema::tryCaptureVariable( } return true; } - OpenMPClauseKind IsOpenMPPrivateDecl = isOpenMPPrivateDecl( + OpenMPClauseKind IsOpenMPPrivateDecl = OpenMP().isOpenMPPrivateDecl( Var, RSI->OpenMPLevel, RSI->OpenMPCaptureLevel); // If the variable is private (i.e. not captured) and has variably // modified type, we still need to capture the type for correct @@ -19907,7 +19360,8 @@ bool Sema::tryCaptureVariable( QualType QTy = Var->getType(); if (ParmVarDecl *PVD = dyn_cast_or_null(Var)) QTy = PVD->getOriginalType(); - for (int I = 1, E = getNumberOfConstructScopes(RSI->OpenMPLevel); + for (int I = 1, + E = OpenMP().getNumberOfConstructScopes(RSI->OpenMPLevel); I < E; ++I) { auto *OuterRSI = cast( FunctionScopes[FunctionScopesIndex - I]); @@ -19919,18 +19373,19 @@ bool Sema::tryCaptureVariable( } bool IsTargetCap = IsOpenMPPrivateDecl != OMPC_private && - isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel, - RSI->OpenMPCaptureLevel); + OpenMP().isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel, + RSI->OpenMPCaptureLevel); // Do not capture global if it is not privatized in outer regions. bool IsGlobalCap = - IsGlobal && isOpenMPGlobalCapturedDecl(Var, RSI->OpenMPLevel, - RSI->OpenMPCaptureLevel); + IsGlobal && OpenMP().isOpenMPGlobalCapturedDecl( + Var, RSI->OpenMPLevel, RSI->OpenMPCaptureLevel); // When we detect target captures we are looking from inside the // target region, therefore we need to propagate the capture from the // enclosing region. Therefore, the capture is not initially nested. if (IsTargetCap) - adjustOpenMPTargetScopeIndex(FunctionScopesIndex, RSI->OpenMPLevel); + OpenMP().adjustOpenMPTargetScopeIndex(FunctionScopesIndex, + RSI->OpenMPLevel); if (IsTargetCap || IsOpenMPPrivateDecl == OMPC_private || (IsGlobal && !IsGlobalCap)) { @@ -20752,8 +20207,8 @@ static void MarkExprReferenced(Sema &SemaRef, SourceLocation Loc, Decl *D, Expr *E, bool MightBeOdrUse, llvm::DenseMap &RefsMinusAssignments) { - if (SemaRef.isInOpenMPDeclareTargetContext()) - SemaRef.checkDeclIsAllowedInOpenMPTarget(E, D); + if (SemaRef.OpenMP().isInOpenMPDeclareTargetContext()) + SemaRef.OpenMP().checkDeclIsAllowedInOpenMPTarget(E, D); if (VarDecl *Var = dyn_cast(D)) { DoMarkVarDeclReferenced(SemaRef, Loc, Var, E, RefsMinusAssignments); diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 74ed3fe7bd520..7582cbd75fec0 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -4250,7 +4250,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, AssignmentAction Action, CheckedConversionKind CCK) { // C++ [over.match.oper]p7: [...] operands of class type are converted [...] - if (CCK == CCK_ForBuiltinOverloadedOp && !From->getType()->isRecordType()) + if (CCK == CheckedConversionKind::ForBuiltinOverloadedOp && + !From->getType()->isRecordType()) return From; switch (ICS.getKind()) { @@ -4311,7 +4312,7 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, // C++ [over.match.oper]p7: // [...] the second standard conversion sequence of a user-defined // conversion sequence is not applied. 
- if (CCK == CCK_ForBuiltinOverloadedOp) + if (CCK == CheckedConversionKind::ForBuiltinOverloadedOp) return From; return PerformImplicitConversion(From, ToType, ICS.UserDefined.After, @@ -4352,7 +4353,8 @@ Sema::PerformImplicitConversion(Expr *From, QualType ToType, const StandardConversionSequence& SCS, AssignmentAction Action, CheckedConversionKind CCK) { - bool CStyle = (CCK == CCK_CStyleCast || CCK == CCK_FunctionalCast); + bool CStyle = (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::FunctionalCast); // Overall FIXME: we are recomputing too many types here and doing far too // much extra work. What this means is that we need to keep track of more @@ -9151,7 +9153,7 @@ Sema::CheckMicrosoftIfExistsSymbol(Scope *S, // Do the redeclaration lookup in the current scope. LookupResult R(*this, TargetNameInfo, Sema::LookupAnyName, - Sema::NotForRedeclaration); + RedeclarationKind::NotForRedeclaration); LookupParsedName(R, S, &SS); R.suppressDiagnostics(); diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp index 32998ae60eafe..c79128bc8f39e 100644 --- a/clang/lib/Sema/SemaExprMember.cpp +++ b/clang/lib/Sema/SemaExprMember.cpp @@ -9,7 +9,6 @@ // This file implements semantic analysis member access expressions. // //===----------------------------------------------------------------------===// -#include "clang/Sema/Overload.h" #include "clang/AST/ASTLambda.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" @@ -18,9 +17,11 @@ #include "clang/AST/ExprObjC.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/Lookup.h" +#include "clang/Sema/Overload.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" using namespace clang; using namespace sema; @@ -727,7 +728,7 @@ static bool LookupMemberExprInRecord(Sema &SemaRef, LookupResult &R, Sema &SemaRef; DeclarationNameInfo NameInfo; Sema::LookupNameKind LookupKind; - Sema::RedeclarationKind Redecl; + RedeclarationKind Redecl; }; QueryState Q = {R.getSema(), R.getLookupNameInfo(), R.getLookupKind(), R.redeclarationKind()}; @@ -1900,9 +1901,9 @@ Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow, if (getLangOpts().OpenMP && IsArrow && !CurContext->isDependentContext() && isa(Base.get()->IgnoreParenImpCasts())) { - if (auto *PrivateCopy = isOpenMPCapturedDecl(Field)) { - return getOpenMPCapturedExpr(PrivateCopy, VK, OK, - MemberNameInfo.getLoc()); + if (auto *PrivateCopy = OpenMP().isOpenMPCapturedDecl(Field)) { + return OpenMP().getOpenMPCapturedExpr(PrivateCopy, VK, OK, + MemberNameInfo.getLoc()); } } diff --git a/clang/lib/Sema/SemaExprObjC.cpp b/clang/lib/Sema/SemaExprObjC.cpp index 3148f0db6e20c..b13a9d426983b 100644 --- a/clang/lib/Sema/SemaExprObjC.cpp +++ b/clang/lib/Sema/SemaExprObjC.cpp @@ -3745,22 +3745,22 @@ bool Sema::isKnownName(StringRef name) { template static void addFixitForObjCARCConversion( - Sema &S, DiagBuilderT &DiagB, Sema::CheckedConversionKind CCK, + Sema &S, DiagBuilderT &DiagB, CheckedConversionKind CCK, SourceLocation afterLParen, QualType castType, Expr *castExpr, Expr *realCast, const char *bridgeKeyword, const char *CFBridgeName) { // We handle C-style and implicit casts here. 
switch (CCK) { - case Sema::CCK_ImplicitConversion: - case Sema::CCK_ForBuiltinOverloadedOp: - case Sema::CCK_CStyleCast: - case Sema::CCK_OtherCast: + case CheckedConversionKind::Implicit: + case CheckedConversionKind::ForBuiltinOverloadedOp: + case CheckedConversionKind::CStyleCast: + case CheckedConversionKind::OtherCast: break; - case Sema::CCK_FunctionalCast: + case CheckedConversionKind::FunctionalCast: return; } if (CFBridgeName) { - if (CCK == Sema::CCK_OtherCast) { + if (CCK == CheckedConversionKind::OtherCast) { if (const CXXNamedCastExpr *NCE = dyn_cast(realCast)) { SourceRange range(NCE->getOperatorLoc(), NCE->getAngleBrackets().getEnd()); @@ -3805,9 +3805,9 @@ static void addFixitForObjCARCConversion( return; } - if (CCK == Sema::CCK_CStyleCast) { + if (CCK == CheckedConversionKind::CStyleCast) { DiagB.AddFixItHint(FixItHint::CreateInsertion(afterLParen, bridgeKeyword)); - } else if (CCK == Sema::CCK_OtherCast) { + } else if (CCK == CheckedConversionKind::OtherCast) { if (const CXXNamedCastExpr *NCE = dyn_cast(realCast)) { std::string castCode = "("; castCode += bridgeKeyword; @@ -3866,12 +3866,12 @@ static ObjCBridgeRelatedAttr *ObjCBridgeRelatedAttrFromType(QualType T, return nullptr; } -static void -diagnoseObjCARCConversion(Sema &S, SourceRange castRange, - QualType castType, ARCConversionTypeClass castACTC, - Expr *castExpr, Expr *realCast, - ARCConversionTypeClass exprACTC, - Sema::CheckedConversionKind CCK) { +static void diagnoseObjCARCConversion(Sema &S, SourceRange castRange, + QualType castType, + ARCConversionTypeClass castACTC, + Expr *castExpr, Expr *realCast, + ARCConversionTypeClass exprACTC, + CheckedConversionKind CCK) { SourceLocation loc = (castRange.isValid() ? castRange.getBegin() : castExpr->getExprLoc()); @@ -3927,7 +3927,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, assert(CreateRule != ACC_bottom && "This cast should already be accepted."); if (CreateRule != ACC_plusOne) { - auto DiagB = (CCK != Sema::CCK_OtherCast) + auto DiagB = (CCK != CheckedConversionKind::OtherCast) ? S.Diag(noteLoc, diag::note_arc_bridge) : S.Diag(noteLoc, diag::note_arc_cstyle_bridge); @@ -3937,7 +3937,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, } if (CreateRule != ACC_plusZero) { - auto DiagB = (CCK == Sema::CCK_OtherCast && !br) + auto DiagB = (CCK == CheckedConversionKind::OtherCast && !br) ? S.Diag(noteLoc, diag::note_arc_cstyle_bridge_transfer) << castExprType : S.Diag(br ? castExpr->getExprLoc() : noteLoc, @@ -3968,7 +3968,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, assert(CreateRule != ACC_bottom && "This cast should already be accepted."); if (CreateRule != ACC_plusOne) { - auto DiagB = (CCK != Sema::CCK_OtherCast) + auto DiagB = (CCK != CheckedConversionKind::OtherCast) ? S.Diag(noteLoc, diag::note_arc_bridge) : S.Diag(noteLoc, diag::note_arc_cstyle_bridge); addFixitForObjCARCConversion(S, DiagB, CCK, afterLParen, @@ -3977,7 +3977,7 @@ diagnoseObjCARCConversion(Sema &S, SourceRange castRange, } if (CreateRule != ACC_plusZero) { - auto DiagB = (CCK == Sema::CCK_OtherCast && !br) + auto DiagB = (CCK == CheckedConversionKind::OtherCast && !br) ? S.Diag(noteLoc, diag::note_arc_cstyle_bridge_retained) << castType : S.Diag(br ? castExpr->getExprLoc() : noteLoc, @@ -4403,7 +4403,8 @@ Sema::CheckObjCConversion(SourceRange castRange, QualType castType, // Check for viability and report error if casting an rvalue to a // life-time qualifier. 
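// [Editorial sketch] The enum migration this patch applies across Sema: the
// unscoped Sema::CCK_* constants become a scoped enumeration, so the values
// no longer live in Sema's scope and no longer convert implicitly to int.
// The enumerator list is the one visible in this diff; the declaration site
// and ordering are assumptions, as neither is shown here.
enum class CheckedConversionKind {
  Implicit,
  CStyleCast,
  FunctionalCast,
  OtherCast,
  ForBuiltinOverloadedOp,
};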
if (castACTC == ACTC_retainable && - (CCK == CCK_CStyleCast || CCK == CCK_OtherCast) && + (CCK == CheckedConversionKind::CStyleCast || + CCK == CheckedConversionKind::OtherCast) && castType != castExprType) { const Type *DT = castType.getTypePtr(); QualType QDT = castType; @@ -4517,11 +4518,11 @@ void Sema::diagnoseARCUnbridgedCast(Expr *e) { if (CStyleCastExpr *cast = dyn_cast(realCast)) { castRange = SourceRange(cast->getLParenLoc(), cast->getRParenLoc()); castType = cast->getTypeAsWritten(); - CCK = CCK_CStyleCast; + CCK = CheckedConversionKind::CStyleCast; } else if (ExplicitCastExpr *cast = dyn_cast(realCast)) { castRange = cast->getTypeInfoAsWritten()->getTypeLoc().getSourceRange(); castType = cast->getTypeAsWritten(); - CCK = CCK_OtherCast; + CCK = CheckedConversionKind::OtherCast; } else { llvm_unreachable("Unexpected ImplicitCastExpr"); } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index fb7a80ab02846..e86f7578ff0c0 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -9057,11 +9057,11 @@ ExprResult InitializationSequence::Perform(Sema &S, } } - Sema::CheckedConversionKind CCK - = Kind.isCStyleCast()? Sema::CCK_CStyleCast - : Kind.isFunctionalCast()? Sema::CCK_FunctionalCast - : Kind.isExplicitCast()? Sema::CCK_OtherCast - : Sema::CCK_ImplicitConversion; + CheckedConversionKind CCK = + Kind.isCStyleCast() ? CheckedConversionKind::CStyleCast + : Kind.isFunctionalCast() ? CheckedConversionKind::FunctionalCast + : Kind.isExplicitCast() ? CheckedConversionKind::OtherCast + : CheckedConversionKind::Implicit; ExprResult CurInitExprRes = S.PerformImplicitConversion(CurInit.get(), Step->Type, *Step->ICS, getAssignmentAction(Entity), CCK); diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index 35a51c6c2328d..1743afaf15287 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -21,6 +21,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "llvm/ADT/STLExtras.h" #include @@ -1398,7 +1399,7 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, // OpenMP lambdas might get assumumption attributes. if (LangOpts.OpenMP) - ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Method); + OpenMP().ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Method); handleLambdaNumbering(Class, Method); diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index d65f52b8efe81..55af414df39f5 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -4449,7 +4449,8 @@ LabelDecl *Sema::LookupOrCreateLabel(IdentifierInfo *II, SourceLocation Loc, } // Not a GNU local label. - Res = LookupSingleName(CurScope, II, Loc, LookupLabel, NotForRedeclaration); + Res = LookupSingleName(CurScope, II, Loc, LookupLabel, + RedeclarationKind::NotForRedeclaration); // If we found a label, check to see if it is in the same context as us. // When in a Block, we don't want to reuse a label in an enclosing function. 
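// [Editorial note] RedeclarationKind gets the same treatment as
// CheckedConversionKind sketched above: call sites now spell
// RedeclarationKind::NotForRedeclaration instead of Sema::NotForRedeclaration,
// and the new Sema::forRedeclarationInCurContext() below picks between
// ForVisibleRedeclaration and ForExternalRedeclaration. The enum's
// declaration site is outside this diff.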
if (Res && Res->getDeclContext() != CurContext) @@ -5889,7 +5890,8 @@ void Sema::clearDelayedTypo(TypoExpr *TE) { void Sema::ActOnPragmaDump(Scope *S, SourceLocation IILoc, IdentifierInfo *II) { DeclarationNameInfo Name(II, IILoc); - LookupResult R(*this, Name, LookupAnyName, Sema::NotForRedeclaration); + LookupResult R(*this, Name, LookupAnyName, + RedeclarationKind::NotForRedeclaration); R.suppressDiagnostics(); R.setHideTags(false); LookupName(R, S); @@ -5899,3 +5901,13 @@ void Sema::ActOnPragmaDump(Scope *S, SourceLocation IILoc, IdentifierInfo *II) { void Sema::ActOnPragmaDump(Expr *E) { E->dump(); } + +RedeclarationKind Sema::forRedeclarationInCurContext() const { + // A declaration with an owning module for linkage can never link against + // anything that is not visible. We don't need to check linkage here; if + // the context has internal linkage, redeclaration lookup won't find things + // from other TUs, and we can't safely compute linkage yet in general. + if (cast(CurContext)->getOwningModuleForLinkage(/*IgnoreLinkage*/ true)) + return RedeclarationKind::ForVisibleRedeclaration; + return RedeclarationKind::ForExternalRedeclaration; +} diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 1249136c87650..59f65eaf47a6d 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -15,6 +15,7 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/Basic/DiagnosticSema.h" #include "clang/Sema/Sema.h" +#include "llvm/Support/Casting.h" using namespace clang; @@ -76,6 +77,19 @@ bool doesClauseApplyToDirective(OpenACCDirectiveKind DirectiveKind, default: return false; } + case OpenACCClauseKind::Self: + switch (DirectiveKind) { + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::Update: + case OpenACCDirectiveKind::ParallelLoop: + case OpenACCDirectiveKind::SerialLoop: + case OpenACCDirectiveKind::KernelsLoop: + return true; + default: + return false; + } default: // Do nothing so we can go to the 'unimplemented' diagnostic instead. return true; @@ -121,9 +135,7 @@ SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, // Restrictions only properly implemented on 'compute' constructs, and // 'compute' constructs are the only construct that can do anything with // this yet, so skip/treat as unimplemented in this case. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Parallel && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Serial && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Kernels) + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) break; // Don't add an invalid clause to the AST. @@ -146,9 +158,7 @@ SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, // Restrictions only properly implemented on 'compute' constructs, and // 'compute' constructs are the only construct that can do anything with // this yet, so skip/treat as unimplemented in this case. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Parallel && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Serial && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Kernels) + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) break; // There is no prose in the standard that says duplicates aren't allowed, @@ -160,12 +170,54 @@ SemaOpenACC::ActOnClause(ArrayRef ExistingClauses, // The parser has ensured that we have a proper condition expr, so there // isn't really much to do here. 
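// [Editorial example] Hypothetical source for the new if/self interaction
// handled in the OpenACC hunks below:
//   #pragma acc parallel if(cond) self(use_host)
//   { /* ... */ }
// A true 'if' condition renders 'self' inert, so whichever of the two
// clauses appears second now draws warn_acc_if_self_conflict, with
// note_acc_previous_clause_here on the earlier clause; the check is
// mirrored in both the 'if' and the new 'self' handler.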
- // TODO OpenACC: When we implement 'self', this clauses causes us to - // 'ignore' the self clause, so we should implement a warning here. + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. + // TODO OpenACC: When we add these two to other constructs, we might not + // want to warn on this (for example, 'update'). + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + return OpenACCIfClause::Create( getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getConditionExpr(), Clause.getEndLoc()); } + + case OpenACCClauseKind::Self: { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + break; + + // TODO OpenACC: When we implement this for 'update', this takes a + // 'var-list' instead of a condition expression, so semantics/handling has + // to happen differently here. + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) + return nullptr; + + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. + // TODO OpenACC: When we add these two to other constructs, we might not + // want to warn on this (for example, 'update'). + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred); + if (Itr != ExistingClauses.end()) { + Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + + return OpenACCSelfClause::Create( + getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getConditionExpr(), Clause.getEndLoc()); + } default: break; } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index e9efb4721133f..3e9f6cba25076 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -11,6 +11,7 @@ /// //===----------------------------------------------------------------------===// +#include "clang/Sema/SemaOpenMP.h" #include "TreeTransform.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTMutationListener.h" @@ -33,6 +34,7 @@ #include "clang/Sema/ParsedAttr.h" #include "clang/Sema/Scope.h" #include "clang/Sema/ScopeInfo.h" +#include "clang/Sema/Sema.h" #include "clang/Sema/SemaInternal.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/PointerEmbeddedInt.h" @@ -1808,9 +1810,9 @@ const DSAStackTy::DSAVarData DSAStackTy::getTopDSA(ValueDecl *D, return DVar; } const_iterator End = end(); - if (!SemaRef.isOpenMPCapturedByRef(D, - std::distance(ParentIterTarget, End), - /*OpenMPCaptureLevel=*/0)) { + if (!SemaRef.OpenMP().isOpenMPCapturedByRef( + D, std::distance(ParentIterTarget, End), + /*OpenMPCaptureLevel=*/0)) { DVar.RefExpr = buildDeclRefExpr(SemaRef, VD, D->getType().getNonReferenceType(), IterTarget->ConstructLoc); @@ -2018,22 +2020,22 @@ bool DSAStackTy::hasDirective( return false; } -void Sema::InitDataSharingAttributesStack() { - VarDataSharingAttributesStack = new DSAStackTy(*this); +void SemaOpenMP::InitDataSharingAttributesStack() { + 
VarDataSharingAttributesStack = new DSAStackTy(SemaRef); } #define DSAStack static_cast(VarDataSharingAttributesStack) -void Sema::pushOpenMPFunctionRegion() { DSAStack->pushFunction(); } +void SemaOpenMP::pushOpenMPFunctionRegion() { DSAStack->pushFunction(); } -void Sema::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) { +void SemaOpenMP::popOpenMPFunctionRegion(const FunctionScopeInfo *OldFSI) { DSAStack->popFunction(OldFSI); } static bool isOpenMPDeviceDelayedContext(Sema &S) { assert(S.LangOpts.OpenMP && S.LangOpts.OpenMPIsTargetDevice && "Expected OpenMP device compilation."); - return !S.isInOpenMPTargetExecutionDirective(); + return !S.OpenMP().isInOpenMPTargetExecutionDirective(); } namespace { @@ -2045,20 +2047,20 @@ enum class FunctionEmissionStatus { }; } // anonymous namespace -Sema::SemaDiagnosticBuilder -Sema::diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, - const FunctionDecl *FD) { - assert(LangOpts.OpenMP && LangOpts.OpenMPIsTargetDevice && +SemaBase::SemaDiagnosticBuilder +SemaOpenMP::diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, + const FunctionDecl *FD) { + assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice && "Expected OpenMP device compilation."); SemaDiagnosticBuilder::Kind Kind = SemaDiagnosticBuilder::K_Nop; if (FD) { - FunctionEmissionStatus FES = getEmissionStatus(FD); + Sema::FunctionEmissionStatus FES = SemaRef.getEmissionStatus(FD); switch (FES) { - case FunctionEmissionStatus::Emitted: + case Sema::FunctionEmissionStatus::Emitted: Kind = SemaDiagnosticBuilder::K_Immediate; break; - case FunctionEmissionStatus::Unknown: + case Sema::FunctionEmissionStatus::Unknown: // TODO: We should always delay diagnostics here in case a target // region is in a function we do not emit. However, as the // current diagnostics are associated with the function containing @@ -2066,48 +2068,48 @@ Sema::diagIfOpenMPDeviceCode(SourceLocation Loc, unsigned DiagID, // on diagnostics for the target region itself. We need to anchor // the diagnostics with the new generated function *or* ensure we // emit diagnostics associated with the surrounding function. - Kind = isOpenMPDeviceDelayedContext(*this) + Kind = isOpenMPDeviceDelayedContext(SemaRef) ? 
SemaDiagnosticBuilder::K_Deferred : SemaDiagnosticBuilder::K_Immediate; break; - case FunctionEmissionStatus::TemplateDiscarded: - case FunctionEmissionStatus::OMPDiscarded: + case Sema::FunctionEmissionStatus::TemplateDiscarded: + case Sema::FunctionEmissionStatus::OMPDiscarded: Kind = SemaDiagnosticBuilder::K_Nop; break; - case FunctionEmissionStatus::CUDADiscarded: + case Sema::FunctionEmissionStatus::CUDADiscarded: llvm_unreachable("CUDADiscarded unexpected in OpenMP device compilation"); break; } } - return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, *this); + return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, SemaRef); } -Sema::SemaDiagnosticBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc, - unsigned DiagID, - const FunctionDecl *FD) { - assert(LangOpts.OpenMP && !LangOpts.OpenMPIsTargetDevice && +SemaBase::SemaDiagnosticBuilder +SemaOpenMP::diagIfOpenMPHostCode(SourceLocation Loc, unsigned DiagID, + const FunctionDecl *FD) { + assert(getLangOpts().OpenMP && !getLangOpts().OpenMPIsTargetDevice && "Expected OpenMP host compilation."); SemaDiagnosticBuilder::Kind Kind = SemaDiagnosticBuilder::K_Nop; if (FD) { - FunctionEmissionStatus FES = getEmissionStatus(FD); + Sema::FunctionEmissionStatus FES = SemaRef.getEmissionStatus(FD); switch (FES) { - case FunctionEmissionStatus::Emitted: + case Sema::FunctionEmissionStatus::Emitted: Kind = SemaDiagnosticBuilder::K_Immediate; break; - case FunctionEmissionStatus::Unknown: + case Sema::FunctionEmissionStatus::Unknown: Kind = SemaDiagnosticBuilder::K_Deferred; break; - case FunctionEmissionStatus::TemplateDiscarded: - case FunctionEmissionStatus::OMPDiscarded: - case FunctionEmissionStatus::CUDADiscarded: + case Sema::FunctionEmissionStatus::TemplateDiscarded: + case Sema::FunctionEmissionStatus::OMPDiscarded: + case Sema::FunctionEmissionStatus::CUDADiscarded: Kind = SemaDiagnosticBuilder::K_Nop; break; } } - return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, *this); + return SemaDiagnosticBuilder(Kind, Loc, DiagID, FD, SemaRef); } static OpenMPDefaultmapClauseKind @@ -2124,9 +2126,9 @@ getVariableCategoryFromDecl(const LangOptions &LO, const ValueDecl *VD) { return OMPC_DEFAULTMAP_aggregate; } -bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, - unsigned OpenMPCaptureLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +bool SemaOpenMP::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, + unsigned OpenMPCaptureLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); ASTContext &Ctx = getASTContext(); bool IsByRef = true; @@ -2252,7 +2254,7 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, !Ty->isAnyPointerType()) || !Ty->isScalarType() || DSAStack->isDefaultmapCapturedByRef( - Level, getVariableCategoryFromDecl(LangOpts, D)) || + Level, getVariableCategoryFromDecl(getLangOpts(), D)) || DSAStack->hasExplicitDSA( D, [](OpenMPClauseKind K, bool AppliedToPointee) { @@ -2303,17 +2305,17 @@ bool Sema::isOpenMPCapturedByRef(const ValueDecl *D, unsigned Level, return IsByRef; } -unsigned Sema::getOpenMPNestingLevel() const { +unsigned SemaOpenMP::getOpenMPNestingLevel() const { assert(getLangOpts().OpenMP); return DSAStack->getNestingLevel(); } -bool Sema::isInOpenMPTaskUntiedContext() const { +bool SemaOpenMP::isInOpenMPTaskUntiedContext() const { return isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) && DSAStack->isUntiedRegion(); } -bool Sema::isInOpenMPTargetExecutionDirective() const { +bool SemaOpenMP::isInOpenMPTargetExecutionDirective() const 
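// [Editorial summary] The deferral policy of the two diagnostic builders
// above, by emission status:
//   Emitted                          -> K_Immediate
//   Unknown                          -> device: K_Deferred inside a delayed
//                                       device context, else K_Immediate;
//                                       host: K_Deferred
//   TemplateDiscarded / OMPDiscarded -> K_Nop
//   CUDADiscarded                    -> K_Nop on the host path; unreachable
//                                       in OpenMP device compilation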
{ return (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) && !DSAStack->isClauseParsingMode()) || DSAStack->hasDirective( @@ -2324,7 +2326,7 @@ bool Sema::isInOpenMPTargetExecutionDirective() const { false); } -bool Sema::isOpenMPRebuildMemberExpr(ValueDecl *D) { +bool SemaOpenMP::isOpenMPRebuildMemberExpr(ValueDecl *D) { // Only rebuild for Field. if (!dyn_cast(D)) return false; @@ -2347,9 +2349,9 @@ static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id, DeclContext *CurContext, bool AsExpression); -VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, - unsigned StopAt) { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +VarDecl *SemaOpenMP::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, + unsigned StopAt) { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); D = getCanonicalDecl(D); auto *VD = dyn_cast(D); @@ -2368,7 +2370,8 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, // 'target' we return true so that this global is also mapped to the device. // if (VD && !VD->hasLocalStorage() && - (getCurCapturedRegion() || getCurBlock() || getCurLambda())) { + (SemaRef.getCurCapturedRegion() || SemaRef.getCurBlock() || + SemaRef.getCurLambda())) { if (isInOpenMPTargetExecutionDirective()) { DSAStackTy::DSAVarData DVarTop = DSAStack->getTopDSA(D, DSAStack->isClauseParsingMode()); @@ -2381,8 +2384,9 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, return nullptr; CapturedRegionScopeInfo *CSI = nullptr; for (FunctionScopeInfo *FSI : llvm::drop_begin( - llvm::reverse(FunctionScopes), - CheckScopeInfo ? (FunctionScopes.size() - (StopAt + 1)) : 0)) { + llvm::reverse(SemaRef.FunctionScopes), + CheckScopeInfo ? (SemaRef.FunctionScopes.size() - (StopAt + 1)) + : 0)) { if (!isa(FSI)) return nullptr; if (auto *RSI = dyn_cast(FSI)) @@ -2401,7 +2405,7 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, if (isInOpenMPDeclareTargetContext()) { // Try to mark variable as declare target if it is used in capturing // regions. 
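// [Editorial example] Hypothetical source for the global-capture rule
// handled here:
//   int G;
//   void f() {
//   #pragma omp target
//     G += 1;   // a global used in a target region is reported as captured
//   }           // so that it also gets mapped to the device
// Under OpenMP <= 4.5, a use inside a declare-target context may instead
// implicitly mark G as declare target, as the call just below does.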
- if (LangOpts.OpenMP <= 45 && + if (getLangOpts().OpenMP <= 45 && !OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) checkDeclIsAllowedInOpenMPTarget(nullptr, VD); return nullptr; @@ -2411,7 +2415,7 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, if (CheckScopeInfo) { bool OpenMPFound = false; for (unsigned I = StopAt + 1; I > 0; --I) { - FunctionScopeInfo *FSI = FunctionScopes[I - 1]; + FunctionScopeInfo *FSI = SemaRef.FunctionScopes[I - 1]; if (!isa(FSI)) return nullptr; if (auto *RSI = dyn_cast(FSI)) @@ -2476,22 +2480,23 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, VarDecl *VD = DSAStack->getImplicitFDCapExprDecl(FD); if (VD) return VD; - if (getCurrentThisType().isNull()) + if (SemaRef.getCurrentThisType().isNull()) return nullptr; - Expr *ThisExpr = BuildCXXThisExpr(SourceLocation(), getCurrentThisType(), - /*IsImplicit=*/true); + Expr *ThisExpr = SemaRef.BuildCXXThisExpr(SourceLocation(), + SemaRef.getCurrentThisType(), + /*IsImplicit=*/true); const CXXScopeSpec CS = CXXScopeSpec(); - Expr *ME = BuildMemberExpr(ThisExpr, /*IsArrow=*/true, SourceLocation(), - NestedNameSpecifierLoc(), SourceLocation(), FD, - DeclAccessPair::make(FD, FD->getAccess()), - /*HadMultipleCandidates=*/false, - DeclarationNameInfo(), FD->getType(), - VK_LValue, OK_Ordinary); + Expr *ME = SemaRef.BuildMemberExpr( + ThisExpr, /*IsArrow=*/true, SourceLocation(), + NestedNameSpecifierLoc(), SourceLocation(), FD, + DeclAccessPair::make(FD, FD->getAccess()), + /*HadMultipleCandidates=*/false, DeclarationNameInfo(), FD->getType(), + VK_LValue, OK_Ordinary); OMPCapturedExprDecl *CD = buildCaptureDecl( - *this, FD->getIdentifier(), ME, DVarPrivate.CKind != OMPC_private, - CurContext->getParent(), /*AsExpression=*/false); + SemaRef, FD->getIdentifier(), ME, DVarPrivate.CKind != OMPC_private, + SemaRef.CurContext->getParent(), /*AsExpression=*/false); DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, CD, CD->getType().getNonReferenceType(), SourceLocation()); + SemaRef, CD, CD->getType().getNonReferenceType(), SourceLocation()); VD = cast(VDPrivateRefExpr->getDecl()); DSAStack->addImplicitDefaultFirstprivateFD(FD, VD); return VD; @@ -2505,28 +2510,28 @@ VarDecl *Sema::isOpenMPCapturedDecl(ValueDecl *D, bool CheckScopeInfo, return nullptr; } -void Sema::adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex, - unsigned Level) const { +void SemaOpenMP::adjustOpenMPTargetScopeIndex(unsigned &FunctionScopesIndex, + unsigned Level) const { FunctionScopesIndex -= getOpenMPCaptureLevels(DSAStack->getDirective(Level)); } -void Sema::startOpenMPLoop() { - assert(LangOpts.OpenMP && "OpenMP must be enabled."); +void SemaOpenMP::startOpenMPLoop() { + assert(getLangOpts().OpenMP && "OpenMP must be enabled."); if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) DSAStack->loopInit(); } -void Sema::startOpenMPCXXRangeFor() { - assert(LangOpts.OpenMP && "OpenMP must be enabled."); +void SemaOpenMP::startOpenMPCXXRangeFor() { + assert(getLangOpts().OpenMP && "OpenMP must be enabled."); if (isOpenMPLoopDirective(DSAStack->getCurrentDirective())) { DSAStack->resetPossibleLoopCounter(); DSAStack->loopStart(); } } -OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, - unsigned CapLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +OpenMPClauseKind SemaOpenMP::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, + unsigned CapLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); if 
(DSAStack->getCurrentDirective() != OMPD_unknown && (!DSAStack->isClauseParsingMode() || DSAStack->getParentDirective() != OMPD_unknown)) { @@ -2546,7 +2551,8 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, } if (DSAStack->hasExplicitDirective(isOpenMPTaskingDirective, Level)) { bool IsTriviallyCopyable = - D->getType().getNonReferenceType().isTriviallyCopyableType(Context) && + D->getType().getNonReferenceType().isTriviallyCopyableType( + getASTContext()) && !D->getType() .getNonReferenceType() .getCanonicalType() @@ -2620,9 +2626,9 @@ OpenMPClauseKind Sema::isOpenMPPrivateDecl(ValueDecl *D, unsigned Level, : OMPC_unknown; } -void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, - unsigned Level) { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +void SemaOpenMP::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, + unsigned Level) { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); D = getCanonicalDecl(D); OpenMPClauseKind OMPC = OMPC_unknown; for (unsigned I = DSAStack->getNestingLevel() + 1; I > Level; --I) { @@ -2649,18 +2655,19 @@ void Sema::setOpenMPCaptureKind(FieldDecl *FD, const ValueDecl *D, NewLevel)) { OMPC = OMPC_map; if (DSAStack->mustBeFirstprivateAtLevel( - NewLevel, getVariableCategoryFromDecl(LangOpts, D))) + NewLevel, getVariableCategoryFromDecl(getLangOpts(), D))) OMPC = OMPC_firstprivate; break; } } if (OMPC != OMPC_unknown) - FD->addAttr(OMPCaptureKindAttr::CreateImplicit(Context, unsigned(OMPC))); + FD->addAttr( + OMPCaptureKindAttr::CreateImplicit(getASTContext(), unsigned(OMPC))); } -bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +bool SemaOpenMP::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, + unsigned CaptureLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); // Return true if the current level is no longer enclosed in a target region. SmallVector Regions; @@ -2672,9 +2679,9 @@ bool Sema::isOpenMPTargetCapturedDecl(const ValueDecl *D, unsigned Level, Regions[CaptureLevel] != OMPD_task; } -bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, - unsigned CaptureLevel) const { - assert(LangOpts.OpenMP && "OpenMP is not allowed"); +bool SemaOpenMP::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, + unsigned CaptureLevel) const { + assert(getLangOpts().OpenMP && "OpenMP is not allowed"); // Return true if the current level is no longer enclosed in a target region. 
if (const auto *VD = dyn_cast(D)) { @@ -2702,37 +2709,37 @@ bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, return true; } -void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; } +void SemaOpenMP::DestroyDataSharingAttributesStack() { delete DSAStack; } -void Sema::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc, - OMPTraitInfo &TI) { +void SemaOpenMP::ActOnOpenMPBeginDeclareVariant(SourceLocation Loc, + OMPTraitInfo &TI) { OMPDeclareVariantScopes.push_back(OMPDeclareVariantScope(TI)); } -void Sema::ActOnOpenMPEndDeclareVariant() { +void SemaOpenMP::ActOnOpenMPEndDeclareVariant() { assert(isInOpenMPDeclareVariantScope() && "Not in OpenMP declare variant scope!"); OMPDeclareVariantScopes.pop_back(); } -void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, - const FunctionDecl *Callee, - SourceLocation Loc) { - assert(LangOpts.OpenMP && "Expected OpenMP compilation mode."); +void SemaOpenMP::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, + const FunctionDecl *Callee, + SourceLocation Loc) { + assert(getLangOpts().OpenMP && "Expected OpenMP compilation mode."); std::optional DevTy = OMPDeclareTargetDeclAttr::getDeviceType(Caller->getMostRecentDecl()); // Ignore host functions during device analyzis. - if (LangOpts.OpenMPIsTargetDevice && + if (getLangOpts().OpenMPIsTargetDevice && (!DevTy || *DevTy == OMPDeclareTargetDeclAttr::DT_Host)) return; // Ignore nohost functions during host analyzis. - if (!LangOpts.OpenMPIsTargetDevice && DevTy && + if (!getLangOpts().OpenMPIsTargetDevice && DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) return; const FunctionDecl *FD = Callee->getMostRecentDecl(); DevTy = OMPDeclareTargetDeclAttr::getDeviceType(FD); - if (LangOpts.OpenMPIsTargetDevice && DevTy && + if (getLangOpts().OpenMPIsTargetDevice && DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_Host) { // Diagnose host function called during device codegen. 
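// [Editorial example] Hypothetical source for this delayed host/device
// analysis:
//   #pragma omp declare target device_type(host)
//   void host_only();
// A call to host_only() reached while compiling for the device is diagnosed
// here; symmetrically, a device_type(nohost) callee reached from host code
// (when offload is not mandatory) is diagnosed below unless a 5.2-style
// host variant of it exists.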
StringRef HostDevTy = @@ -2743,8 +2750,9 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, << HostDevTy; return; } - if (!LangOpts.OpenMPIsTargetDevice && !LangOpts.OpenMPOffloadMandatory && - DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { + if (!getLangOpts().OpenMPIsTargetDevice && + !getLangOpts().OpenMPOffloadMandatory && DevTy && + *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { // In OpenMP 5.2 or later, if the function has a host variant then allow // that to be called instead auto &&HasHostAttr = [](const FunctionDecl *Callee) { @@ -2773,21 +2781,21 @@ void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, } } -void Sema::StartOpenMPDSABlock(OpenMPDirectiveKind DKind, - const DeclarationNameInfo &DirName, - Scope *CurScope, SourceLocation Loc) { +void SemaOpenMP::StartOpenMPDSABlock(OpenMPDirectiveKind DKind, + const DeclarationNameInfo &DirName, + Scope *CurScope, SourceLocation Loc) { DSAStack->push(DKind, DirName, CurScope, Loc); - PushExpressionEvaluationContext( - ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); } -void Sema::StartOpenMPClause(OpenMPClauseKind K) { +void SemaOpenMP::StartOpenMPClause(OpenMPClauseKind K) { DSAStack->setClauseParsingMode(K); } -void Sema::EndOpenMPClause() { +void SemaOpenMP::EndOpenMPClause() { DSAStack->setClauseParsingMode(/*K=*/OMPC_unknown); - CleanupVarDeclMarking(); + SemaRef.CleanupVarDeclMarking(); } static std::pair @@ -2871,7 +2879,7 @@ static void reportOriginalDsa(Sema &SemaRef, const DSAStackTy *Stack, const DSAStackTy::DSAVarData &DVar, bool IsLoopIterVar = false); -void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { +void SemaOpenMP::EndOpenMPDSABlock(Stmt *CurDirective) { // OpenMP [2.14.3.5, Restrictions, C/C++, p.1] // A variable of class type (or array thereof) that appears in a lastprivate // clause requires an accessible, unambiguous default constructor for the @@ -2898,15 +2906,15 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { // variable is not added to IdResolver, so the code in the OpenMP // region uses original variable for proper diagnostics. VarDecl *VDPrivate = buildVarDecl( - *this, DE->getExprLoc(), Type.getUnqualifiedType(), + SemaRef, DE->getExprLoc(), Type.getUnqualifiedType(), VD->getName(), VD->hasAttrs() ? &VD->getAttrs() : nullptr, DRE); - ActOnUninitializedDecl(VDPrivate); + SemaRef.ActOnUninitializedDecl(VDPrivate); if (VDPrivate->isInvalidDecl()) { PrivateCopies.push_back(nullptr); continue; } PrivateCopies.push_back(buildDeclRefExpr( - *this, VDPrivate, DE->getType(), DE->getExprLoc())); + SemaRef, VDPrivate, DE->getType(), DE->getExprLoc())); } else { // The variable is also a firstprivate, so initialization sequence // for private copy is generated already. @@ -2924,7 +2932,7 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) // It will be analyzed later. 
PrivateRefs.push_back(RefExpr); @@ -2977,7 +2985,7 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { diag::err_omp_allocator_used_in_clauses) << D.Allocator->getSourceRange(); if (DVar.RefExpr) - reportOriginalDsa(*this, DSAStack, VD, DVar); + reportOriginalDsa(SemaRef, DSAStack, VD, DVar); else Diag(MapExpr->getExprLoc(), diag::note_used_here) << MapExpr->getSourceRange(); @@ -2987,14 +2995,14 @@ void Sema::EndOpenMPDSABlock(Stmt *CurDirective) { } } // Check allocate clauses. - if (!CurContext->isDependentContext()) - checkAllocateClauses(*this, DSAStack, D->clauses()); - checkReductionClauses(*this, DSAStack, D->clauses()); + if (!SemaRef.CurContext->isDependentContext()) + checkAllocateClauses(SemaRef, DSAStack, D->clauses()); + checkReductionClauses(SemaRef, DSAStack, D->clauses()); } DSAStack->pop(); - DiscardCleanupsInEvaluationContext(); - PopExpressionEvaluationContext(); + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); } static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, @@ -3047,27 +3055,28 @@ class VarOrFuncDeclFilterCCC final : public CorrectionCandidateCallback { } // namespace -ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, - CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id, - OpenMPDirectiveKind Kind) { - LookupResult Lookup(*this, Id, LookupOrdinaryName); - LookupParsedName(Lookup, CurScope, &ScopeSpec, true); +ExprResult SemaOpenMP::ActOnOpenMPIdExpression(Scope *CurScope, + CXXScopeSpec &ScopeSpec, + const DeclarationNameInfo &Id, + OpenMPDirectiveKind Kind) { + ASTContext &Context = getASTContext(); + LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName); + SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true); if (Lookup.isAmbiguous()) return ExprError(); VarDecl *VD; if (!Lookup.isSingleResult()) { - VarDeclFilterCCC CCC(*this); + VarDeclFilterCCC CCC(SemaRef); if (TypoCorrection Corrected = - CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC, - CTK_ErrorRecovery)) { - diagnoseTypo(Corrected, - PDiag(Lookup.empty() - ? diag::err_undeclared_var_use_suggest - : diag::err_omp_expected_var_arg_suggest) - << Id.getName()); + SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr, + CCC, Sema::CTK_ErrorRecovery)) { + SemaRef.diagnoseTypo( + Corrected, + SemaRef.PDiag(Lookup.empty() ? diag::err_undeclared_var_use_suggest + : diag::err_omp_expected_var_arg_suggest) + << Id.getName()); VD = Corrected.getCorrectionDeclAs(); } else { Diag(Id.getLoc(), Lookup.empty() ? diag::err_undeclared_var_use @@ -3101,7 +3110,7 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // A threadprivate directive for file-scope variables must appear outside // any definition or declaration. if (CanonicalVD->getDeclContext()->isTranslationUnit() && - !getCurLexicalContext()->isTranslationUnit()) { + !SemaRef.getCurLexicalContext()->isTranslationUnit()) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3116,7 +3125,7 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // in the class definition, in the same scope in which the member // variables are declared. 
if (CanonicalVD->isStaticDataMember() && - !CanonicalVD->getDeclContext()->Equals(getCurLexicalContext())) { + !CanonicalVD->getDeclContext()->Equals(SemaRef.getCurLexicalContext())) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3131,8 +3140,9 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // outside any definition or declaration other than the namespace // definition itself. if (CanonicalVD->getDeclContext()->isNamespace() && - (!getCurLexicalContext()->isFileContext() || - !getCurLexicalContext()->Encloses(CanonicalVD->getDeclContext()))) { + (!SemaRef.getCurLexicalContext()->isFileContext() || + !SemaRef.getCurLexicalContext()->Encloses( + CanonicalVD->getDeclContext()))) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3146,7 +3156,7 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, // A threadprivate directive for static block-scope variables must appear // in the scope of the variable and not in a nested scope. if (CanonicalVD->isLocalVarDecl() && CurScope && - !isDeclInScope(ND, getCurLexicalContext(), CurScope)) { + !SemaRef.isDeclInScope(ND, SemaRef.getCurLexicalContext(), CurScope)) { Diag(Id.getLoc(), diag::err_omp_var_scope) << getOpenMPDirectiveName(Kind) << VD; bool IsDecl = @@ -3174,11 +3184,11 @@ ExprResult Sema::ActOnOpenMPIdExpression(Scope *CurScope, Id.getLoc(), ExprType, VK_LValue); } -Sema::DeclGroupPtrTy -Sema::ActOnOpenMPThreadprivateDirective(SourceLocation Loc, - ArrayRef VarList) { +SemaOpenMP::DeclGroupPtrTy +SemaOpenMP::ActOnOpenMPThreadprivateDirective(SourceLocation Loc, + ArrayRef VarList) { if (OMPThreadPrivateDecl *D = CheckOMPThreadPrivateDecl(Loc, VarList)) { - CurContext->addDecl(D); + SemaRef.CurContext->addDecl(D); return DeclGroupPtrTy::make(DeclGroupRef(D)); } return nullptr; @@ -3215,7 +3225,9 @@ class LocalVarRefChecker final } // namespace OMPThreadPrivateDecl * -Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { +SemaOpenMP::CheckOMPThreadPrivateDecl(SourceLocation Loc, + ArrayRef VarList) { + ASTContext &Context = getASTContext(); SmallVector Vars; for (Expr *RefExpr : VarList) { auto *DE = cast(RefExpr); @@ -3235,8 +3247,8 @@ Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { // OpenMP [2.9.2, Restrictions, C/C++, p.10] // A threadprivate variable must not have an incomplete type. - if (RequireCompleteType(ILoc, VD->getType(), - diag::err_omp_threadprivate_incomplete_type)) { + if (SemaRef.RequireCompleteType( + ILoc, VD->getType(), diag::err_omp_threadprivate_incomplete_type)) { continue; } @@ -3274,7 +3286,7 @@ Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { // Check if initial value of threadprivate variable reference variable with // local storage (it is not supported by runtime). 
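// [Editorial examples] Hypothetical sources for the threadprivate
// restrictions enforced in this function:
//   int G;
//   void f() {
//   #pragma omp threadprivate(G)   // err_omp_var_scope: a file-scope
//   }                              // variable must be named at file scope
//   int use() {
//     int L = 0;
//     static int S = L;
//   #pragma omp threadprivate(S)   // rejected below: the initializer
//     return S;                    // references a local variable
//   }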
if (const Expr *Init = VD->getAnyInitializer()) { - LocalVarRefChecker Checker(*this); + LocalVarRefChecker Checker(SemaRef); if (Checker.Visit(Init)) continue; } @@ -3288,8 +3300,8 @@ Sema::CheckOMPThreadPrivateDecl(SourceLocation Loc, ArrayRef VarList) { } OMPThreadPrivateDecl *D = nullptr; if (!Vars.empty()) { - D = OMPThreadPrivateDecl::Create(Context, getCurLexicalContext(), Loc, - Vars); + D = OMPThreadPrivateDecl::Create(Context, SemaRef.getCurLexicalContext(), + Loc, Vars); D->setAccess(AS_public); } return D; @@ -3395,10 +3407,9 @@ applyOMPAllocateAttribute(Sema &S, VarDecl *VD, ML->DeclarationMarkedOpenMPAllocate(VD, A); } -Sema::DeclGroupPtrTy -Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, - ArrayRef Clauses, - DeclContext *Owner) { +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPAllocateDirective( + SourceLocation Loc, ArrayRef VarList, ArrayRef Clauses, + DeclContext *Owner) { assert(Clauses.size() <= 2 && "Expected at most two clauses."); Expr *Alignment = nullptr; Expr *Allocator = nullptr; @@ -3407,9 +3418,9 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, // allocate directives that appear in a target region must specify an // allocator clause unless a requires directive with the dynamic_allocators // clause is present in the same compilation unit. - if (LangOpts.OpenMPIsTargetDevice && + if (getLangOpts().OpenMPIsTargetDevice && !DSAStack->hasRequiresDeclWithClause()) - targetDiag(Loc, diag::err_expected_allocator_clause); + SemaRef.targetDiag(Loc, diag::err_expected_allocator_clause); } else { for (const OMPClause *C : Clauses) if (const auto *AC = dyn_cast(C)) @@ -3420,7 +3431,7 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, llvm_unreachable("Unexpected clause on allocate directive"); } OMPAllocateDeclAttr::AllocatorTypeTy AllocatorKind = - getAllocatorKind(*this, DSAStack, Allocator); + getAllocatorKind(SemaRef, DSAStack, Allocator); SmallVector Vars; for (Expr *RefExpr : VarList) { auto *DE = cast(RefExpr); @@ -3435,7 +3446,7 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, // If the used several times in the allocate directive, the same allocator // must be used. - if (checkPreviousOMPAllocateAttribute(*this, DSAStack, RefExpr, VD, + if (checkPreviousOMPAllocateAttribute(SemaRef, DSAStack, RefExpr, VD, AllocatorKind, Allocator)) continue; @@ -3448,7 +3459,7 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, Diag(Allocator->getExprLoc(), diag::err_omp_expected_predefined_allocator) << Allocator->getSourceRange(); - bool IsDecl = VD->isThisDeclarationADefinition(Context) == + bool IsDecl = VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(VD->getLocation(), IsDecl ? 
diag::note_previous_decl : diag::note_defined_here) @@ -3458,45 +3469,46 @@ Sema::ActOnOpenMPAllocateDirective(SourceLocation Loc, ArrayRef VarList, } Vars.push_back(RefExpr); - applyOMPAllocateAttribute(*this, VD, AllocatorKind, Allocator, Alignment, + applyOMPAllocateAttribute(SemaRef, VD, AllocatorKind, Allocator, Alignment, DE->getSourceRange()); } if (Vars.empty()) return nullptr; if (!Owner) - Owner = getCurLexicalContext(); - auto *D = OMPAllocateDecl::Create(Context, Owner, Loc, Vars, Clauses); + Owner = SemaRef.getCurLexicalContext(); + auto *D = OMPAllocateDecl::Create(getASTContext(), Owner, Loc, Vars, Clauses); D->setAccess(AS_public); Owner->addDecl(D); return DeclGroupPtrTy::make(DeclGroupRef(D)); } -Sema::DeclGroupPtrTy -Sema::ActOnOpenMPRequiresDirective(SourceLocation Loc, - ArrayRef ClauseList) { +SemaOpenMP::DeclGroupPtrTy +SemaOpenMP::ActOnOpenMPRequiresDirective(SourceLocation Loc, + ArrayRef ClauseList) { OMPRequiresDecl *D = nullptr; - if (!CurContext->isFileContext()) { + if (!SemaRef.CurContext->isFileContext()) { Diag(Loc, diag::err_omp_invalid_scope) << "requires"; } else { D = CheckOMPRequiresDecl(Loc, ClauseList); if (D) { - CurContext->addDecl(D); + SemaRef.CurContext->addDecl(D); DSAStack->addRequiresDecl(D); } } return DeclGroupPtrTy::make(DeclGroupRef(D)); } -void Sema::ActOnOpenMPAssumesDirective(SourceLocation Loc, - OpenMPDirectiveKind DKind, - ArrayRef Assumptions, - bool SkippedClauses) { +void SemaOpenMP::ActOnOpenMPAssumesDirective(SourceLocation Loc, + OpenMPDirectiveKind DKind, + ArrayRef Assumptions, + bool SkippedClauses) { if (!SkippedClauses && Assumptions.empty()) Diag(Loc, diag::err_omp_no_clause_for_directive) << llvm::omp::getAllAssumeClauseOptions() << llvm::omp::getOpenMPDirectiveName(DKind); - auto *AA = OMPAssumeAttr::Create(Context, llvm::join(Assumptions, ","), Loc); + auto *AA = + OMPAssumeAttr::Create(getASTContext(), llvm::join(Assumptions, ","), Loc); if (DKind == llvm::omp::Directive::OMPD_begin_assumes) { OMPAssumeScoped.push_back(AA); return; @@ -3515,7 +3527,7 @@ void Sema::ActOnOpenMPAssumesDirective(SourceLocation Loc, // declarations in included headers. To this end, we traverse all existing // declaration contexts and annotate function declarations here. SmallVector DeclContexts; - auto *Ctx = CurContext; + auto *Ctx = SemaRef.CurContext; while (Ctx->getLexicalParent()) Ctx = Ctx->getLexicalParent(); DeclContexts.push_back(Ctx); @@ -3539,13 +3551,14 @@ void Sema::ActOnOpenMPAssumesDirective(SourceLocation Loc, } } -void Sema::ActOnOpenMPEndAssumesDirective() { +void SemaOpenMP::ActOnOpenMPEndAssumesDirective() { assert(isInOpenMPAssumeScope() && "Not in OpenMP assumes scope!"); OMPAssumeScoped.pop_back(); } -OMPRequiresDecl *Sema::CheckOMPRequiresDecl(SourceLocation Loc, - ArrayRef ClauseList) { +OMPRequiresDecl * +SemaOpenMP::CheckOMPRequiresDecl(SourceLocation Loc, + ArrayRef ClauseList) { /// For target specific clauses, the requires directive cannot be /// specified after the handling of any of the target regions in the /// current compilation unit. 
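// [Editorial example] Hypothetical source for the scope rule in
// ActOnOpenMPRequiresDirective above:
//   void f() {
//   #pragma omp requires unified_shared_memory   // err_omp_invalid_scope:
//   }                                            // must appear at file scope
// Per the /// comment above, target-related requires clauses must also
// precede the first target region of the compilation unit.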
@@ -3576,8 +3589,8 @@ OMPRequiresDecl *Sema::CheckOMPRequiresDecl(SourceLocation Loc, } if (!DSAStack->hasDuplicateRequiresClause(ClauseList)) - return OMPRequiresDecl::Create(Context, getCurLexicalContext(), Loc, - ClauseList); + return OMPRequiresDecl::Create( + getASTContext(), SemaRef.getCurLexicalContext(), Loc, ClauseList); return nullptr; } @@ -3695,7 +3708,7 @@ class DSAAttrChecker final : public StmtVisitor { llvm::SmallVector ImplicitMap[DefaultmapKindNum][OMPC_MAP_delete]; llvm::SmallVector ImplicitMapModifier[DefaultmapKindNum]; - Sema::VarsWithInheritedDSAType VarsWithInheritedDSA; + SemaOpenMP::VarsWithInheritedDSAType VarsWithInheritedDSA; llvm::SmallDenseSet ImplicitDeclarations; void VisitSubCaptures(OMPExecutableDirective *S) { @@ -4161,7 +4174,7 @@ class DSAAttrChecker final : public StmtVisitor { getImplicitMapModifier(OpenMPDefaultmapClauseKind Kind) const { return ImplicitMapModifier[Kind]; } - const Sema::VarsWithInheritedDSAType &getVarsWithInheritedDSA() const { + const SemaOpenMP::VarsWithInheritedDSAType &getVarsWithInheritedDSA() const { return VarsWithInheritedDSA; } @@ -4193,7 +4206,9 @@ static void handleDeclareVariantConstructTrait(DSAStackTy *Stack, Stack->handleConstructTrait(Traits, ScopeEntry); } -void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { +void SemaOpenMP::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, + Scope *CurScope) { + ASTContext &Context = getASTContext(); switch (DKind) { case OMPD_parallel: case OMPD_parallel_for: @@ -4208,13 +4223,13 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst(); QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); break; } case OMPD_target_teams: @@ -4232,7 +4247,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4242,31 +4257,33 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/0); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); - SmallVector ParamsTarget; + SmallVector ParamsTarget; if (getLangOpts().OpenMPIsTargetDevice) ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy)); ParamsTarget.push_back( std::make_pair(StringRef(), QualType())); // __context with shared vars; // Start a captured region for 'target' with no implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTarget, - /*OpenMPCaptureLevel=*/1); - Sema::CapturedParamNameType ParamsTeamsOrParallel[] = { + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTarget, + /*OpenMPCaptureLevel=*/1); + SemaOpenMP::CapturedParamNameType ParamsTeamsOrParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'teams' or 'parallel'. Both regions have // the same implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTeamsOrParallel, /*OpenMPCaptureLevel=*/2); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTeamsOrParallel, + /*OpenMPCaptureLevel=*/2); break; } case OMPD_target: @@ -4279,7 +4296,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4289,21 +4306,22 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/0); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); - SmallVector ParamsTarget; + SmallVector ParamsTarget; if (getLangOpts().OpenMPIsTargetDevice) ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy)); ParamsTarget.push_back( std::make_pair(StringRef(), QualType())); // __context with shared vars; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTarget, - /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTarget, + /*OpenMPCaptureLevel=*/1); break; } case OMPD_atomic: @@ -4329,11 +4347,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { case OMPD_scope: case OMPD_target_data: case OMPD_dispatch: { - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); break; } case OMPD_task: { @@ -4345,7 +4363,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4355,11 +4373,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); // Mark this captured region as inlined, because we don't use outlined // function directly. - getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4386,7 +4404,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4401,11 +4419,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".reductions.", VoidPtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4426,19 +4444,20 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType VoidPtrTy = Context.VoidPtrTy.withConst().withRestrict(); QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType ParamsParallel[] = { + SemaOpenMP::CapturedParamNameType ParamsParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'parallel'. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsParallel, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsParallel, + /*OpenMPCaptureLevel=*/0); QualType Args[] = {VoidPtrTy}; FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4453,11 +4472,12 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".reductions.", VoidPtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/1); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4467,15 +4487,15 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType KmpInt32Ty = Context.getIntTypeForBitwidth(32, 1).withConst(); QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(".previous.lb.", Context.getSizeType().withConst()), std::make_pair(".previous.ub.", Context.getSizeType().withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); break; } // For 'target teams loop', collect all captured regions so codegen can @@ -4492,7 +4512,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4502,32 +4522,35 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params, + /*OpenMPCaptureLevel=*/0); // Mark this captured region as inlined, because we don't use outlined // function directly. - getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); - SmallVector ParamsTarget; + SmallVector ParamsTarget; if (getLangOpts().OpenMPIsTargetDevice) ParamsTarget.push_back(std::make_pair(StringRef("dyn_ptr"), VoidPtrTy)); ParamsTarget.push_back( std::make_pair(StringRef(), QualType())); // __context with shared vars; // Start a captured region for 'target' with no implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTarget, /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTarget, + /*OpenMPCaptureLevel=*/1); - Sema::CapturedParamNameType ParamsTeams[] = { + SemaOpenMP::CapturedParamNameType ParamsTeams[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'target' with no implicit parameters. 
- ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTeams, /*OpenMPCaptureLevel=*/2); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTeams, + /*OpenMPCaptureLevel=*/2); - Sema::CapturedParamNameType ParamsParallel[] = { + SemaOpenMP::CapturedParamNameType ParamsParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(".previous.lb.", Context.getSizeType().withConst()), @@ -4536,8 +4559,9 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { }; // Start a captured region for 'teams' or 'parallel'. Both regions have // the same implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsParallel, /*OpenMPCaptureLevel=*/3); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsParallel, + /*OpenMPCaptureLevel=*/3); break; } @@ -4548,16 +4572,17 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { QualType KmpInt32PtrTy = Context.getPointerType(KmpInt32Ty).withConst().withRestrict(); - Sema::CapturedParamNameType ParamsTeams[] = { + SemaOpenMP::CapturedParamNameType ParamsTeams[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(StringRef(), QualType()) // __context with shared vars }; // Start a captured region for 'target' with no implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsTeams, /*OpenMPCaptureLevel=*/0); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsTeams, + /*OpenMPCaptureLevel=*/0); - Sema::CapturedParamNameType ParamsParallel[] = { + SemaOpenMP::CapturedParamNameType ParamsParallel[] = { std::make_pair(".global_tid.", KmpInt32PtrTy), std::make_pair(".bound_tid.", KmpInt32PtrTy), std::make_pair(".previous.lb.", Context.getSizeType().withConst()), @@ -4566,8 +4591,9 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { }; // Start a captured region for 'teams' or 'parallel'. Both regions have // the same implicit parameters. - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - ParamsParallel, /*OpenMPCaptureLevel=*/1); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, ParamsParallel, + /*OpenMPCaptureLevel=*/1); break; } case OMPD_target_update: @@ -4581,7 +4607,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { FunctionProtoType::ExtProtoInfo EPI; EPI.Variadic = true; QualType CopyFnType = Context.getFunctionType(Context.VoidTy, Args, EPI); - Sema::CapturedParamNameType Params[] = { + SemaOpenMP::CapturedParamNameType Params[] = { std::make_pair(".global_tid.", KmpInt32Ty), std::make_pair(".part_id.", KmpInt32PtrTy), std::make_pair(".privates.", VoidPtrTy), @@ -4591,11 +4617,11 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { std::make_pair(".task_t.", Context.VoidPtrTy.withConst()), std::make_pair(StringRef(), QualType()) // __context with shared vars }; - ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, CR_OpenMP, - Params); + SemaRef.ActOnCapturedRegionStart(DSAStack->getConstructLoc(), CurScope, + CR_OpenMP, Params); // Mark this captured region as inlined, because we don't use outlined // function directly. 
- getCurCapturedRegion()->TheCapturedDecl->addAttr( + SemaRef.getCurCapturedRegion()->TheCapturedDecl->addAttr( AlwaysInlineAttr::CreateImplicit( Context, {}, AlwaysInlineAttr::Keyword_forceinline)); break; @@ -4626,15 +4652,15 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { default: llvm_unreachable("Unknown OpenMP directive"); } - DSAStack->setContext(CurContext); + DSAStack->setContext(SemaRef.CurContext); handleDeclareVariantConstructTrait(DSAStack, DKind, /* ScopeEntry */ true); } -int Sema::getNumberOfConstructScopes(unsigned Level) const { +int SemaOpenMP::getNumberOfConstructScopes(unsigned Level) const { return getOpenMPCaptureLevels(DSAStack->getDirective(Level)); } -int Sema::getOpenMPCaptureLevels(OpenMPDirectiveKind DKind) { +int SemaOpenMP::getOpenMPCaptureLevels(OpenMPDirectiveKind DKind) { SmallVector CaptureRegions; getOpenMPCaptureRegions(CaptureRegions, DKind); return CaptureRegions.size(); @@ -4674,7 +4700,7 @@ static OMPCapturedExprDecl *buildCaptureDecl(Sema &S, IdentifierInfo *Id, static DeclRefExpr *buildCapture(Sema &S, ValueDecl *D, Expr *CaptureExpr, bool WithInit) { OMPCapturedExprDecl *CD; - if (VarDecl *VD = S.isOpenMPCapturedDecl(D)) + if (VarDecl *VD = S.OpenMP().isOpenMPCapturedDecl(D)) CD = cast(VD); else CD = buildCaptureDecl(S, D->getIdentifier(), CaptureExpr, WithInit, @@ -4726,7 +4752,7 @@ class CaptureRegionUnwinderRAII { : S(S), ErrorFound(ErrorFound), DKind(DKind) {} ~CaptureRegionUnwinderRAII() { if (ErrorFound) { - int ThisCaptureLevel = S.getOpenMPCaptureLevels(DKind); + int ThisCaptureLevel = S.OpenMP().getOpenMPCaptureLevels(DKind); while (--ThisCaptureLevel >= 0) S.ActOnCapturedRegionError(); } @@ -4734,10 +4760,10 @@ class CaptureRegionUnwinderRAII { }; } // namespace -void Sema::tryCaptureOpenMPLambdas(ValueDecl *V) { +void SemaOpenMP::tryCaptureOpenMPLambdas(ValueDecl *V) { // Capture variables captured by reference in lambdas for target-based // directives. 
- if (!CurContext->isDependentContext() && + if (!SemaRef.CurContext->isDependentContext() && (isOpenMPTargetExecutionDirective(DSAStack->getCurrentDirective()) || isOpenMPTargetDataManagementDirective( DSAStack->getCurrentDirective()))) { @@ -4757,14 +4783,14 @@ void Sema::tryCaptureOpenMPLambdas(ValueDecl *V) { if (LC.getCaptureKind() == LCK_ByRef) { VarDecl *VD = cast(LC.getCapturedVar()); DeclContext *VDC = VD->getDeclContext(); - if (!VDC->Encloses(CurContext)) + if (!VDC->Encloses(SemaRef.CurContext)) continue; - MarkVariableReferenced(LC.getLocation(), VD); + SemaRef.MarkVariableReferenced(LC.getLocation(), VD); } else if (LC.getCaptureKind() == LCK_This) { - QualType ThisTy = getCurrentThisType(); - if (!ThisTy.isNull() && - Context.typesAreCompatible(ThisTy, ThisCapture->getType())) - CheckCXXThisCapture(LC.getLocation()); + QualType ThisTy = SemaRef.getCurrentThisType(); + if (!ThisTy.isNull() && getASTContext().typesAreCompatible( + ThisTy, ThisCapture->getType())) + SemaRef.CheckCXXThisCapture(LC.getLocation()); } } } @@ -4804,8 +4830,8 @@ static bool checkOrderedOrderSpecified(Sema &S, return false; } -StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, - ArrayRef Clauses) { +StmtResult SemaOpenMP::ActOnOpenMPRegionEnd(StmtResult S, + ArrayRef Clauses) { handleDeclareVariantConstructTrait(DSAStack, DSAStack->getCurrentDirective(), /* ScopeEntry */ false); if (DSAStack->getCurrentDirective() == OMPD_atomic || @@ -4817,7 +4843,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, bool ErrorFound = false; CaptureRegionUnwinderRAII CaptureRegionUnwinder( - *this, ErrorFound, DSAStack->getCurrentDirective()); + SemaRef, ErrorFound, DSAStack->getCurrentDirective()); if (!S.isUsable()) { ErrorFound = true; return StmtError(); @@ -4831,7 +4857,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, SmallVector PICs; // This is required for proper codegen. for (OMPClause *Clause : Clauses) { - if (!LangOpts.OpenMPSimd && + if (!getLangOpts().OpenMPSimd && (isOpenMPTaskingDirective(DSAStack->getCurrentDirective()) || DSAStack->getCurrentDirective() == OMPD_target) && Clause->getClauseKind() == OMPC_in_reduction) { @@ -4840,7 +4866,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, auto *IRC = cast(Clause); for (Expr *E : IRC->taskgroup_descriptors()) if (E) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } if (isOpenMPPrivate(Clause->getClauseKind()) || Clause->getClauseKind() == OMPC_copyprivate || @@ -4851,7 +4877,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, // Mark all variables in private list clauses as used in inner region. for (Stmt *VarRef : Clause->children()) { if (auto *E = cast_or_null(VarRef)) { - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } } DSAStack->setForceVarCapturing(/*V=*/false); @@ -4865,7 +4891,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, PICs.push_back(C); if (auto *C = OMPClauseWithPostUpdate::get(Clause)) { if (Expr *E = C->getPostUpdateExpr()) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } } if (Clause->getClauseKind() == OMPC_schedule) @@ -4877,7 +4903,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, } // Capture allocator expressions if used. 
for (Expr *E : DSAStack->getInnerAllocators()) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); // OpenMP, 2.7.1 Loop Construct, Restrictions // The nonmonotonic modifier cannot be specified if an ordered clause is // specified. @@ -4899,7 +4925,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, // OpenMP 5.0, 2.9.2 Worksharing-Loop Construct, Restrictions. // If an order(concurrent) clause is present, an ordered clause may not appear // on the same directive. - if (checkOrderedOrderSpecified(*this, Clauses)) + if (checkOrderedOrderSpecified(SemaRef, Clauses)) ErrorFound = true; if (!LCs.empty() && OC && OC->getNumForLoops()) { for (const OMPLinearClause *C : LCs) { @@ -4936,7 +4962,8 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, CaptureRegion == OMPD_unknown) { if (auto *DS = cast_or_null(C->getPreInitStmt())) { for (Decl *D : DS->decls()) - MarkVariableReferenced(D->getLocation(), cast(D)); + SemaRef.MarkVariableReferenced(D->getLocation(), + cast(D)); } } } @@ -4950,7 +4977,7 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, ++I) { OMPUsesAllocatorsClause::Data D = UAC->getAllocatorData(I); if (Expr *E = D.AllocatorTraits) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } continue; } @@ -4965,17 +4992,17 @@ StmtResult Sema::ActOnOpenMPRegionEnd(StmtResult S, continue; for (Expr *E : RC->copy_array_temps()) if (E) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } if (auto *AC = dyn_cast(C)) { for (Expr *E : AC->varlists()) - MarkDeclarationsReferencedInExpr(E); + SemaRef.MarkDeclarationsReferencedInExpr(E); } } } if (++CompletedRegions == CaptureRegions.size()) DSAStack->setBodyComplete(); - SR = ActOnCapturedRegionEnd(SR.get()); + SR = SemaRef.ActOnCapturedRegionEnd(SR.get()); } return SR; } @@ -5782,9 +5809,9 @@ static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy, // the OpenMPIRBuilder to know additional C/C++ semantics, such as how to // invoke a copy constructor. 
QualType TargetParamTy = Ctx.getLValueReferenceType(LoopVarTy); - Sema::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy}, - {"Logical", LogicalTy}, - {StringRef(), QualType()}}; + SemaOpenMP::CapturedParamNameType Params[] = {{"LoopVar", TargetParamTy}, + {"Logical", LogicalTy}, + {StringRef(), QualType()}}; Actions.ActOnCapturedRegionStart({}, nullptr, CR_Default, Params); // Capture the initial iterator which represents the LoopVar value at the @@ -5835,7 +5862,7 @@ static CapturedStmt *buildLoopVarFunc(Sema &Actions, QualType LoopVarTy, AssertSuccess(Actions.ActOnCapturedRegionEnd(Body))); } -StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { ASTContext &Ctx = getASTContext(); // Extract the common elements of ForStmt and CXXForRangeStmt: @@ -5946,8 +5973,8 @@ StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { if (IncBin->getOpcode() == BO_AddAssign) { Step = IncBin->getRHS(); } else if (IncBin->getOpcode() == BO_SubAssign) { - Step = - AssertSuccess(BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS())); + Step = AssertSuccess( + SemaRef.BuildUnaryOp(nullptr, {}, UO_Minus, IncBin->getRHS())); } else llvm_unreachable("unhandled binary increment operator"); } else if (auto *CondCXXOp = dyn_cast(Inc)) { @@ -5965,7 +5992,7 @@ StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { break; case OO_MinusEqual: Step = AssertSuccess( - BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1))); + SemaRef.BuildUnaryOp(nullptr, {}, UO_Minus, CondCXXOp->getArg(1))); break; default: llvm_unreachable("unhandled overloaded increment operator"); @@ -5974,16 +6001,17 @@ StmtResult Sema::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { llvm_unreachable("unknown increment expression"); CapturedStmt *DistanceFunc = - buildDistanceFunc(*this, LogicalTy, CondRel, LHS, RHS, Step); + buildDistanceFunc(SemaRef, LogicalTy, CondRel, LHS, RHS, Step); CapturedStmt *LoopVarFunc = buildLoopVarFunc( - *this, LVTy, LogicalTy, CounterRef, Step, isa(AStmt)); - DeclRefExpr *LVRef = BuildDeclRefExpr(LUVDecl, LUVDecl->getType(), VK_LValue, - {}, nullptr, nullptr, {}, nullptr); + SemaRef, LVTy, LogicalTy, CounterRef, Step, isa(AStmt)); + DeclRefExpr *LVRef = + SemaRef.BuildDeclRefExpr(LUVDecl, LUVDecl->getType(), VK_LValue, {}, + nullptr, nullptr, {}, nullptr); return OMPCanonicalLoop::create(getASTContext(), AStmt, DistanceFunc, LoopVarFunc, LVRef); } -StmtResult Sema::ActOnOpenMPLoopnest(Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPLoopnest(Stmt *AStmt) { // Handle a literal loop. 
if (isa(AStmt) || isa(AStmt)) return ActOnOpenMPCanonicalLoop(AStmt); @@ -6128,7 +6156,7 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack, continue; CXXScopeSpec MapperIdScopeSpec; DeclarationNameInfo MapperId; - if (OMPClause *NewClause = S.ActOnOpenMPMapClause( + if (OMPClause *NewClause = S.OpenMP().ActOnOpenMPMapClause( nullptr, C->getMapTypeModifiers(), C->getMapTypeModifiersLoc(), MapperIdScopeSpec, MapperId, C->getMapType(), /*IsMapTypeImplicit=*/true, SourceLocation(), SourceLocation(), @@ -6210,14 +6238,12 @@ static bool teamsLoopCanBeParallelFor(Stmt *AStmt, Sema &SemaRef) { return Checker.teamsLoopCanBeParallelFor(); } -bool Sema::mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, - ArrayRef Clauses, - OpenMPBindClauseKind &BindKind, - OpenMPDirectiveKind &Kind, - OpenMPDirectiveKind &PrevMappedDirective, - SourceLocation StartLoc, SourceLocation EndLoc, - const DeclarationNameInfo &DirName, - OpenMPDirectiveKind CancelRegion) { +bool SemaOpenMP::mapLoopConstruct( + llvm::SmallVector &ClausesWithoutBind, + ArrayRef Clauses, OpenMPBindClauseKind &BindKind, + OpenMPDirectiveKind &Kind, OpenMPDirectiveKind &PrevMappedDirective, + SourceLocation StartLoc, SourceLocation EndLoc, + const DeclarationNameInfo &DirName, OpenMPDirectiveKind CancelRegion) { bool UseClausesWithoutBind = false; @@ -6299,7 +6325,7 @@ bool Sema::mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, return UseClausesWithoutBind; } -StmtResult Sema::ActOnOpenMPExecutableDirective( +StmtResult SemaOpenMP::ActOnOpenMPExecutableDirective( OpenMPDirectiveKind Kind, const DeclarationNameInfo &DirName, OpenMPDirectiveKind CancelRegion, ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, @@ -6324,8 +6350,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } // First check CancelRegion which is then used in checkNestingOfRegions. - if (checkCancelRegion(*this, Kind, CancelRegion, StartLoc) || - checkNestingOfRegions(*this, DSAStack, DK, DirName, CancelRegion, + if (checkCancelRegion(SemaRef, Kind, CancelRegion, StartLoc) || + checkNestingOfRegions(SemaRef, DSAStack, DK, DirName, CancelRegion, BindKind, StartLoc)) { return StmtError(); } @@ -6344,13 +6370,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } else { ClausesWithImplicit.append(Clauses.begin(), Clauses.end()); } - if (AStmt && !CurContext->isDependentContext() && Kind != OMPD_atomic && - Kind != OMPD_critical && Kind != OMPD_section && Kind != OMPD_master && - Kind != OMPD_masked && !isOpenMPLoopTransformationDirective(Kind)) { + if (AStmt && !SemaRef.CurContext->isDependentContext() && + Kind != OMPD_atomic && Kind != OMPD_critical && Kind != OMPD_section && + Kind != OMPD_master && Kind != OMPD_masked && + !isOpenMPLoopTransformationDirective(Kind)) { assert(isa(AStmt) && "Captured statement expected"); // Check default data sharing attributes for referenced variables. - DSAAttrChecker DSAChecker(DSAStack, *this, cast(AStmt)); + DSAAttrChecker DSAChecker(DSAStack, SemaRef, cast(AStmt)); int ThisCaptureLevel = getOpenMPCaptureLevels(Kind); Stmt *S = AStmt; while (--ThisCaptureLevel >= 0) @@ -6490,8 +6517,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } // Build expressions for implicit maps of data members with 'default' // mappers. 
- if (LangOpts.OpenMP >= 50) - processImplicitMapsWithDefaultMappers(*this, DSAStack, + if (getLangOpts().OpenMP >= 50) + processImplicitMapsWithDefaultMappers(SemaRef, DSAStack, ClausesWithImplicit); } @@ -6505,7 +6532,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_simd: Res = ActOnOpenMPSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_tile: @@ -6523,7 +6550,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_for_simd: Res = ActOnOpenMPForSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_sections: @@ -6561,7 +6588,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_scope: @@ -6698,7 +6725,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPTaskLoopSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_master_taskloop: @@ -6715,13 +6742,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPMasterTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_masked_taskloop_simd: Res = ActOnOpenMPMaskedTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_simd); } @@ -6735,7 +6762,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_parallel_masked_taskloop: Res = ActOnOpenMPParallelMaskedTaskLoopDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_parallel); } @@ -6745,13 +6772,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_parallel_masked_taskloop_simd: Res = ActOnOpenMPParallelMaskedTaskLoopSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { AllowedNameModifiers.push_back(OMPD_taskloop); AllowedNameModifiers.push_back(OMPD_parallel); AllowedNameModifiers.push_back(OMPD_simd); @@ -6775,13 +6802,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( Res = ActOnOpenMPDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); 
AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_distribute_simd: Res = ActOnOpenMPDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_parallel_for_simd: @@ -6789,14 +6816,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_simd: Res = ActOnOpenMPTargetSimdDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute: @@ -6806,14 +6833,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPD_teams_distribute_simd: Res = ActOnOpenMPTeamsDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for_simd: Res = ActOnOpenMPTeamsDistributeParallelForSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_teams_distribute_parallel_for: @@ -6842,14 +6869,14 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); AllowedNameModifiers.push_back(OMPD_parallel); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_target_teams_distribute_simd: Res = ActOnOpenMPTargetTeamsDistributeSimdDirective( ClausesWithImplicit, AStmt, StartLoc, EndLoc, VarsWithInheritedDSA); AllowedNameModifiers.push_back(OMPD_target); - if (LangOpts.OpenMP >= 50) + if (getLangOpts().OpenMP >= 50) AllowedNameModifiers.push_back(OMPD_simd); break; case OMPD_interop: @@ -6906,7 +6933,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( if (DSAStack->getDefaultDSA() == DSA_none || DSAStack->getDefaultDSA() == DSA_private || DSAStack->getDefaultDSA() == DSA_firstprivate) { - DSAAttrChecker DSAChecker(DSAStack, *this, nullptr); + DSAAttrChecker DSAChecker(DSAStack, SemaRef, nullptr); for (OMPClause *C : Clauses) { switch (C->getClauseKind()) { case OMPC_num_threads: @@ -7043,13 +7070,13 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( } if (!AllowedNameModifiers.empty()) - ErrorFound = checkIfClauses(*this, Kind, Clauses, AllowedNameModifiers) || + ErrorFound = checkIfClauses(SemaRef, Kind, Clauses, AllowedNameModifiers) || ErrorFound; if (ErrorFound) return StmtError(); - if (!CurContext->isDependentContext() && + if (!SemaRef.CurContext->isDependentContext() && isOpenMPTargetExecutionDirective(Kind) && !(DSAStack->hasRequiresDeclWithClause() || DSAStack->hasRequiresDeclWithClause() || @@ -7062,7 +7089,7 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( return Res; } -Sema::DeclGroupPtrTy 
Sema::ActOnOpenMPDeclareSimdDirective( +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareSimdDirective( DeclGroupPtrTy DG, OMPDeclareSimdDeclAttr::BranchStateTy BS, Expr *Simdlen, ArrayRef Uniforms, ArrayRef Aligneds, ArrayRef Alignments, ArrayRef Linears, @@ -7297,13 +7324,15 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareSimdDirective( NewStep = PerformOpenMPImplicitIntegerConversion(Step->getExprLoc(), Step) .get(); if (NewStep) - NewStep = - VerifyIntegerConstantExpression(NewStep, /*FIXME*/ AllowFold).get(); + NewStep = SemaRef + .VerifyIntegerConstantExpression( + NewStep, /*FIXME*/ Sema::AllowFold) + .get(); } NewSteps.push_back(NewStep); } auto *NewAttr = OMPDeclareSimdDeclAttr::CreateImplicit( - Context, BS, SL.get(), const_cast(Uniforms.data()), + getASTContext(), BS, SL.get(), const_cast(Uniforms.data()), Uniforms.size(), const_cast(Aligneds.data()), Aligneds.size(), const_cast(NewAligns.data()), NewAligns.size(), const_cast(Linears.data()), Linears.size(), @@ -7336,7 +7365,7 @@ static void setPrototype(Sema &S, FunctionDecl *FD, FunctionDecl *FDWithProto, FD->setParams(Params); } -void Sema::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { +void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { if (D->isInvalidDecl()) return; FunctionDecl *FD = nullptr; @@ -7349,7 +7378,7 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { // If we are instantiating templates we do *not* apply scoped assumptions but // only global ones. We apply scoped assumption to the template definition // though. - if (!inTemplateInstantiation()) { + if (!SemaRef.inTemplateInstantiation()) { for (OMPAssumeAttr *AA : OMPAssumeScoped) FD->addAttr(AA); } @@ -7357,10 +7386,10 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPAssumeScope(Decl *D) { FD->addAttr(AA); } -Sema::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI) +SemaOpenMP::OMPDeclareVariantScope::OMPDeclareVariantScope(OMPTraitInfo &TI) : TI(&TI), NameSuffix(TI.getMangledName()) {} -void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( +void SemaOpenMP::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( Scope *S, Declarator &D, MultiTemplateParamsArg TemplateParamLists, SmallVectorImpl &Bases) { if (!D.getIdentifier()) @@ -7376,11 +7405,11 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( return; const IdentifierInfo *BaseII = D.getIdentifier(); - LookupResult Lookup(*this, DeclarationName(BaseII), D.getIdentifierLoc(), - LookupOrdinaryName); - LookupParsedName(Lookup, S, &D.getCXXScopeSpec()); + LookupResult Lookup(SemaRef, DeclarationName(BaseII), D.getIdentifierLoc(), + Sema::LookupOrdinaryName); + SemaRef.LookupParsedName(Lookup, S, &D.getCXXScopeSpec()); - TypeSourceInfo *TInfo = GetTypeForDeclarator(D); + TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D); QualType FType = TInfo->getType(); bool IsConstexpr = @@ -7409,7 +7438,7 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( QualType UDeclTy = UDecl->getType(); if (!UDeclTy->isDependentType()) { - QualType NewType = Context.mergeFunctionTypes( + QualType NewType = getASTContext().mergeFunctionTypes( FType, UDeclTy, /* OfBlockPointer */ false, /* Unqualified */ false, /* AllowCXX */ true); if (NewType.isNull()) @@ -7425,7 +7454,7 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( // If no base was found we create a declaration that we use as base. 
if (Bases.empty() && UseImplicitBase) { D.setFunctionDefinitionKind(FunctionDefinitionKind::Declaration); - Decl *BaseD = HandleDeclarator(S, D, TemplateParamLists); + Decl *BaseD = SemaRef.HandleDeclarator(S, D, TemplateParamLists); BaseD->setImplicit(true); if (auto *BaseTemplD = dyn_cast(BaseD)) Bases.push_back(BaseTemplD->getTemplatedDecl()); @@ -7437,18 +7466,18 @@ void Sema::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( MangledName += D.getIdentifier()->getName(); MangledName += getOpenMPVariantManglingSeparatorStr(); MangledName += DVScope.NameSuffix; - IdentifierInfo &VariantII = Context.Idents.get(MangledName); + IdentifierInfo &VariantII = getASTContext().Idents.get(MangledName); VariantII.setMangledOpenMPVariantName(true); D.SetIdentifier(&VariantII, D.getBeginLoc()); } -void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( +void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( Decl *D, SmallVectorImpl &Bases) { // Do not mark function as is used to prevent its emission if this is the // only place where it is used. EnterExpressionEvaluationContext Unevaluated( - *this, Sema::ExpressionEvaluationContext::Unevaluated); + SemaRef, Sema::ExpressionEvaluationContext::Unevaluated); FunctionDecl *FD = nullptr; if (auto *UTemplDecl = dyn_cast(D)) @@ -7456,14 +7485,14 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( else FD = cast(D); auto *VariantFuncRef = DeclRefExpr::Create( - Context, NestedNameSpecifierLoc(), SourceLocation(), FD, + getASTContext(), NestedNameSpecifierLoc(), SourceLocation(), FD, /* RefersToEnclosingVariableOrCapture */ false, /* NameLoc */ FD->getLocation(), FD->getType(), ExprValueKind::VK_PRValue); OMPDeclareVariantScope &DVScope = OMPDeclareVariantScopes.back(); auto *OMPDeclareVariantA = OMPDeclareVariantAttr::CreateImplicit( - Context, VariantFuncRef, DVScope.TI, + getASTContext(), VariantFuncRef, DVScope.TI, /*NothingArgs=*/nullptr, /*NothingArgsSize=*/0, /*NeedDevicePtrArgs=*/nullptr, /*NeedDevicePtrArgsSize=*/0, /*AppendArgs=*/nullptr, /*AppendArgsSize=*/0); @@ -7471,10 +7500,11 @@ void Sema::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( BaseFD->addAttr(OMPDeclareVariantA); } -ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, - SourceLocation LParenLoc, - MultiExprArg ArgExprs, - SourceLocation RParenLoc, Expr *ExecConfig) { +ExprResult SemaOpenMP::ActOnOpenMPCall(ExprResult Call, Scope *Scope, + SourceLocation LParenLoc, + MultiExprArg ArgExprs, + SourceLocation RParenLoc, + Expr *ExecConfig) { // The common case is a regular call we do not want to specialize at all. Try // to make that case fast by bailing early. 
CallExpr *CE = dyn_cast(Call.get()); @@ -7485,7 +7515,7 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, if (!CalleeFnDecl) return Call; - if (LangOpts.OpenMP >= 51 && CalleeFnDecl->getIdentifier() && + if (getLangOpts().OpenMP >= 51 && CalleeFnDecl->getIdentifier() && CalleeFnDecl->getName().starts_with_insensitive("omp_")) { // checking for any calls inside an Order region if (Scope && Scope->isOpenMPOrderClauseScope()) @@ -7504,7 +7534,8 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, << ISATrait; }; TargetOMPContext OMPCtx(Context, std::move(DiagUnknownTrait), - getCurFunctionDecl(), DSAStack->getConstructTraits()); + SemaRef.getCurFunctionDecl(), + DSAStack->getConstructTraits()); QualType CalleeFnType = CalleeFnDecl->getType(); @@ -7549,7 +7580,7 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, // different type than the base function. This is intended and OK but if // we cannot create a call the difference is not in the "implementation // defined range" we allow. - Sema::TentativeAnalysisScope Trap(*this); + Sema::TentativeAnalysisScope Trap(SemaRef); if (auto *SpecializedMethod = dyn_cast(BestDecl)) { auto *MemberCall = dyn_cast(CE); @@ -7558,12 +7589,12 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, /* IsArrow */ false, SpecializedMethod, Context.BoundMemberTy, MemberCall->getValueKind(), MemberCall->getObjectKind()); } - NewCall = BuildCallExpr(Scope, BestExpr, LParenLoc, ArgExprs, RParenLoc, - ExecConfig); + NewCall = SemaRef.BuildCallExpr(Scope, BestExpr, LParenLoc, ArgExprs, + RParenLoc, ExecConfig); if (NewCall.isUsable()) { if (CallExpr *NCE = dyn_cast(NewCall.get())) { FunctionDecl *NewCalleeFnDecl = NCE->getDirectCallee(); - QualType NewType = Context.mergeFunctionTypes( + QualType NewType = getASTContext().mergeFunctionTypes( CalleeFnType, NewCalleeFnDecl->getType(), /* OfBlockPointer */ false, /* Unqualified */ false, /* AllowCXX */ true); @@ -7581,14 +7612,16 @@ ExprResult Sema::ActOnOpenMPCall(ExprResult Call, Scope *Scope, if (!NewCall.isUsable()) return Call; - return PseudoObjectExpr::Create(Context, CE, {NewCall.get()}, 0); + return PseudoObjectExpr::Create(getASTContext(), CE, {NewCall.get()}, 0); } std::optional> -Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, - Expr *VariantRef, OMPTraitInfo &TI, - unsigned NumAppendArgs, - SourceRange SR) { +SemaOpenMP::checkOpenMPDeclareVariantFunction(SemaOpenMP::DeclGroupPtrTy DG, + Expr *VariantRef, + OMPTraitInfo &TI, + unsigned NumAppendArgs, + SourceRange SR) { + ASTContext &Context = getASTContext(); if (!DG || DG.get().isNull()) return std::nullopt; @@ -7631,7 +7664,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, // Check if the function was emitted already. const FunctionDecl *Definition; if (!FD->isThisDeclarationADefinition() && FD->isDefined(Definition) && - (LangOpts.EmitAllDecls || Context.DeclMustBeEmitted(Definition))) + (getLangOpts().EmitAllDecls || Context.DeclMustBeEmitted(Definition))) Diag(SR.getBegin(), diag::warn_omp_declare_variant_after_emitted) << FD->getLocation(); @@ -7654,7 +7687,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, // Deal with non-constant score and user condition expressions. 
auto HandleNonConstantScoresAndConditions = [this](Expr *&E, bool IsScore) -> bool { - if (!E || E->isIntegerConstantExpr(Context)) + if (!E || E->isIntegerConstantExpr(getASTContext())) return false; if (IsScore) { @@ -7686,9 +7719,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, // Adjust the function type to account for an extra omp_interop_t for each // specified in the append_args clause. const TypeDecl *TD = nullptr; - LookupResult Result(*this, &Context.Idents.get("omp_interop_t"), + LookupResult Result(SemaRef, &Context.Idents.get("omp_interop_t"), SR.getBegin(), Sema::LookupOrdinaryName); - if (LookupName(Result, getCurScope())) { + if (SemaRef.LookupName(Result, SemaRef.getCurScope())) { NamedDecl *ND = Result.getFoundDecl(); TD = dyn_cast_or_null(ND); } @@ -7711,7 +7744,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, // Convert VariantRef expression to the type of the original function to // resolve possible conflicts. ExprResult VariantRefCast = VariantRef; - if (LangOpts.CPlusPlus) { + if (getLangOpts().CPlusPlus) { QualType FnPtrType; auto *Method = dyn_cast(FD); if (Method && !Method->isStatic()) { @@ -7722,9 +7755,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, { // Build adrr_of unary op to correctly handle type checks for member // functions. - Sema::TentativeAnalysisScope Trap(*this); - ER = CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf, - VariantRef); + Sema::TentativeAnalysisScope Trap(SemaRef); + ER = SemaRef.CreateBuiltinUnaryOp(VariantRef->getBeginLoc(), UO_AddrOf, + VariantRef); } if (!ER.isUsable()) { Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected) @@ -7737,9 +7770,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } QualType VarianPtrType = Context.getPointerType(VariantRef->getType()); if (VarianPtrType.getUnqualifiedType() != FnPtrType.getUnqualifiedType()) { - ImplicitConversionSequence ICS = TryImplicitConversion( + ImplicitConversionSequence ICS = SemaRef.TryImplicitConversion( VariantRef, FnPtrType.getUnqualifiedType(), - /*SuppressUserConversions=*/false, AllowedExplicit::None, + /*SuppressUserConversions=*/false, Sema::AllowedExplicit::None, /*InOverloadResolution=*/false, /*CStyle=*/false, /*AllowObjCWritebackConversion=*/false); @@ -7751,8 +7784,8 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, << (NumAppendArgs ? 1 : 0) << VariantRef->getSourceRange(); return std::nullopt; } - VariantRefCast = PerformImplicitConversion( - VariantRef, FnPtrType.getUnqualifiedType(), AA_Converting); + VariantRefCast = SemaRef.PerformImplicitConversion( + VariantRef, FnPtrType.getUnqualifiedType(), Sema::AA_Converting); if (!VariantRefCast.isUsable()) return std::nullopt; } @@ -7765,7 +7798,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } } - ExprResult ER = CheckPlaceholderExpr(VariantRefCast.get()); + ExprResult ER = SemaRef.CheckPlaceholderExpr(VariantRefCast.get()); if (!ER.isUsable() || !ER.get()->IgnoreParenImpCasts()->getType()->isFunctionType()) { Diag(VariantRef->getExprLoc(), diag::err_omp_function_expected) @@ -7795,7 +7828,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } // Check if function types are compatible in C. 
- if (!LangOpts.CPlusPlus) { + if (!getLangOpts().CPlusPlus) { QualType NewType = Context.mergeFunctionTypes(AdjustedFnType, NewFD->getType()); if (NewType.isNull()) { @@ -7807,9 +7840,9 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } if (NewType->isFunctionProtoType()) { if (FD->getType()->isFunctionNoProtoType()) - setPrototype(*this, FD, NewFD, NewType); + setPrototype(SemaRef, FD, NewFD, NewType); else if (NewFD->getType()->isFunctionNoProtoType()) - setPrototype(*this, NewFD, FD, NewType); + setPrototype(SemaRef, NewFD, FD, NewType); } } @@ -7872,15 +7905,15 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, } // Check general compatibility. - if (areMultiversionVariantFunctionsCompatible( + if (SemaRef.areMultiversionVariantFunctionsCompatible( FD, NewFD, PartialDiagnostic::NullDiagnostic(), PartialDiagnosticAt(SourceLocation(), PartialDiagnostic::NullDiagnostic()), PartialDiagnosticAt( VariantRef->getExprLoc(), - PDiag(diag::err_omp_declare_variant_doesnt_support)), + SemaRef.PDiag(diag::err_omp_declare_variant_doesnt_support)), PartialDiagnosticAt(VariantRef->getExprLoc(), - PDiag(diag::err_omp_declare_variant_diff) + SemaRef.PDiag(diag::err_omp_declare_variant_diff) << FD->getLocation()), /*TemplatesSupported=*/true, /*ConstexprSupported=*/false, /*CLinkageMayDiffer=*/true)) @@ -7888,7 +7921,7 @@ Sema::checkOpenMPDeclareVariantFunction(Sema::DeclGroupPtrTy DG, return std::make_pair(FD, cast(DRE)); } -void Sema::ActOnOpenMPDeclareVariantDirective( +void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, ArrayRef AdjustArgsNothing, ArrayRef AdjustArgsNeedDevicePtr, @@ -7906,7 +7939,7 @@ void Sema::ActOnOpenMPDeclareVariantDirective( if (!AllAdjustArgs.empty() || !AppendArgs.empty()) { VariantMatchInfo VMI; - TI.getAsVariantMatchInfo(Context, VMI); + TI.getAsVariantMatchInfo(getASTContext(), VMI); if (!llvm::is_contained( VMI.ConstructTraits, llvm::omp::TraitProperty::construct_dispatch_dispatch)) { @@ -7949,18 +7982,18 @@ void Sema::ActOnOpenMPDeclareVariantDirective( } auto *NewAttr = OMPDeclareVariantAttr::CreateImplicit( - Context, VariantRef, &TI, const_cast(AdjustArgsNothing.data()), - AdjustArgsNothing.size(), + getASTContext(), VariantRef, &TI, + const_cast(AdjustArgsNothing.data()), AdjustArgsNothing.size(), const_cast(AdjustArgsNeedDevicePtr.data()), AdjustArgsNeedDevicePtr.size(), const_cast(AppendArgs.data()), AppendArgs.size(), SR); FD->addAttr(NewAttr); } -StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPParallelDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -7972,11 +8005,11 @@ StmtResult Sema::ActOnOpenMPParallelDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPParallelDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - DSAStack->getTaskgroupReductionRef(), - DSAStack->isCancelRegion()); + return OMPParallelDirective::Create( + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, + DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } namespace { @@ -8226,7 +8259,7 @@ bool OpenMPIterationSpaceChecker::setStep(Expr *NewStep, bool Subtract) { if (!NewStep->isValueDependent()) { // Check that the step is integer expression. SourceLocation StepLoc = NewStep->getBeginLoc(); - ExprResult Val = SemaRef.PerformOpenMPImplicitIntegerConversion( + ExprResult Val = SemaRef.OpenMP().PerformOpenMPImplicitIntegerConversion( StepLoc, getExprAsWritten(NewStep)); if (Val.isInvalid()) return true; @@ -9248,7 +9281,7 @@ DeclRefExpr *OpenMPIterationSpaceChecker::buildCounterVar( DSAStackTy &DSA) const { auto *VD = dyn_cast(LCDecl); if (!VD) { - VD = SemaRef.isOpenMPCapturedDecl(LCDecl); + VD = SemaRef.OpenMP().isOpenMPCapturedDecl(LCDecl); DeclRefExpr *Ref = buildDeclRefExpr( SemaRef, VD, VD->getType().getNonReferenceType(), DefaultLoc); const DSAStackTy::DSAVarData Data = @@ -9321,14 +9354,15 @@ Expr *OpenMPIterationSpaceChecker::buildOrderedLoopData( } } // namespace -void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { +void SemaOpenMP::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, + Stmt *Init) { assert(getLangOpts().OpenMP && "OpenMP is not active."); assert(Init && "Expected loop in canonical form."); unsigned AssociatedLoops = DSAStack->getAssociatedLoops(); if (AssociatedLoops > 0 && isOpenMPLoopDirective(DSAStack->getCurrentDirective())) { DSAStack->loopStart(); - OpenMPIterationSpaceChecker ISC(*this, /*SupportsNonRectangular=*/true, + OpenMPIterationSpaceChecker ISC(SemaRef, /*SupportsNonRectangular=*/true, *DSAStack, ForLoc); if (!ISC.checkAndSetInit(Init, /*EmitDiags=*/false)) { if (ValueDecl *D = ISC.getLoopDecl()) { @@ -9338,7 +9372,7 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { if (VarDecl *Private = isOpenMPCapturedDecl(D)) { VD = Private; } else { - PrivateRef = buildCapture(*this, D, ISC.getLoopDeclRefExpr(), + PrivateRef = buildCapture(SemaRef, D, ISC.getLoopDeclRefExpr(), /*WithInit=*/false); VD = cast(PrivateRef->getDecl()); } @@ -9348,10 +9382,10 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { if (LD != D->getCanonicalDecl()) { DSAStack->resetPossibleLoopCounter(); if (auto *Var = dyn_cast_or_null(LD)) - MarkDeclarationsReferencedInExpr( - buildDeclRefExpr(*this, const_cast(Var), - Var->getType().getNonLValueExprType(Context), - ForLoc, /*RefersToCapture=*/true)); + SemaRef.MarkDeclarationsReferencedInExpr(buildDeclRefExpr( + SemaRef, const_cast(Var), + Var->getType().getNonLValueExprType(getASTContext()), ForLoc, + /*RefersToCapture=*/true)); } OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); // OpenMP [2.14.1.1, Data-sharing Attribute Rules for Variables @@ -9372,8 +9406,8 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { : OMPC_private; if (((isOpenMPSimdDirective(DKind) && DVar.CKind != OMPC_unknown && DVar.CKind != PredeterminedCKind && DVar.RefExpr && - (LangOpts.OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate && - DVar.CKind != OMPC_private))) || + (getLangOpts().OpenMP <= 45 || (DVar.CKind != OMPC_lastprivate && + DVar.CKind != 
OMPC_private))) || ((isOpenMPWorksharingDirective(DKind) || DKind == OMPD_taskloop || DKind == OMPD_master_taskloop || DKind == OMPD_masked_taskloop || DKind == OMPD_parallel_master_taskloop || @@ -9388,7 +9422,7 @@ void Sema::ActOnOpenMPLoopInitialization(SourceLocation ForLoc, Stmt *Init) { << getOpenMPClauseName(PredeterminedCKind); if (DVar.RefExpr == nullptr) DVar.CKind = PredeterminedCKind; - reportOriginalDsa(*this, DSAStack, D, DVar, + reportOriginalDsa(SemaRef, DSAStack, D, DVar, /*IsLoopIterVar=*/true); } else if (LoopDeclRefExpr) { // Make the loop iteration variable private (for worksharing @@ -9428,7 +9462,7 @@ static bool checkOpenMPIterationSpace( unsigned CurrentNestedLoopCount, unsigned NestedLoopCount, unsigned TotalNestedLoopCount, Expr *CollapseLoopCountExpr, Expr *OrderedLoopCountExpr, - Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA, + SemaOpenMP::VarsWithInheritedDSAType &VarsWithImplicitDSA, llvm::MutableArrayRef ResultIterSpaces, llvm::MapVector &Captures) { bool SupportsNonRectangular = !isOpenMPLoopTransformationDirective(DKind); @@ -9817,7 +9851,7 @@ static unsigned checkOpenMPLoop(OpenMPDirectiveKind DKind, Expr *CollapseLoopCountExpr, Expr *OrderedLoopCountExpr, Stmt *AStmt, Sema &SemaRef, DSAStackTy &DSA, - Sema::VarsWithInheritedDSAType &VarsWithImplicitDSA, + SemaOpenMP::VarsWithInheritedDSAType &VarsWithImplicitDSA, OMPLoopBasedDirective::HelperExprs &Built) { unsigned NestedLoopCount = 1; bool SupportsNonPerfectlyNested = (SemaRef.LangOpts.OpenMP >= 50) && @@ -10566,7 +10600,8 @@ static bool checkGenericLoopLastprivate(Sema &S, ArrayRef Clauses, OpenMPDirectiveKind K, DSAStackTy *Stack); -bool Sema::checkLastPrivateForMappedDirectives(ArrayRef Clauses) { +bool SemaOpenMP::checkLastPrivateForMappedDirectives( + ArrayRef Clauses) { // Check for syntax of lastprivate // Param of the lastprivate have different meanings in the mapped directives @@ -10574,16 +10609,15 @@ bool Sema::checkLastPrivateForMappedDirectives(ArrayRef Clauses) { // "omp for" lastprivate vars must be shared if (getLangOpts().OpenMP >= 50 && DSAStack->getMappedDirective() == OMPD_loop && - checkGenericLoopLastprivate(*this, Clauses, OMPD_loop, DSAStack)) { + checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_loop, DSAStack)) { return false; } return true; } -StmtResult -Sema::ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA) { +StmtResult SemaOpenMP::ActOnOpenMPSimdDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) return StmtError(); @@ -10596,38 +10630,37 @@ Sema::ActOnOpenMPSimdDirective(ArrayRef Clauses, Stmt *AStmt, // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_simd, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); auto *SimdDirective = OMPSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getMappedDirective()); return SimdDirective; } -StmtResult -Sema::ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, - SourceLocation StartLoc, SourceLocation EndLoc, - VarsWithInheritedDSAType &VarsWithImplicitDSA) { +StmtResult SemaOpenMP::ActOnOpenMPForDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) return StmtError(); @@ -10640,32 +10673,32 @@ Sema::ActOnOpenMPForDirective(ArrayRef Clauses, Stmt *AStmt, // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_for, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } auto *ForDirective = OMPForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion(), DSAStack->getMappedDirective()); return ForDirective; } -StmtResult Sema::ActOnOpenMPForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10677,37 +10710,37 @@ StmtResult Sema::ActOnOpenMPForSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPForSimdDirective::Create(Context, StartLoc, EndLoc, NestedLoopCount, - Clauses, AStmt, B); + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPForSimdDirective::Create(getASTContext(), StartLoc, EndLoc, + NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPSectionsDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -10736,23 +10769,23 @@ StmtResult Sema::ActOnOpenMPSectionsDirective(ArrayRef Clauses, return StmtError(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPSectionsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - DSAStack->getTaskgroupReductionRef(), - DSAStack->isCancelRegion()); + return OMPSectionsDirective::Create( + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, + DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPSectionDirective(Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPSectionDirective(Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentCancelRegion(DSAStack->isCancelRegion()); - return OMPSectionDirective::Create(Context, StartLoc, EndLoc, AStmt, + return OMPSectionDirective::Create(getASTContext(), StartLoc, EndLoc, AStmt, DSAStack->isCancelRegion()); } @@ -10764,10 +10797,10 @@ static Expr *getDirectCallExpr(Expr *E) { return nullptr; } -StmtResult Sema::ActOnOpenMPDispatchDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPDispatchDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -10780,7 +10813,7 @@ StmtResult Sema::ActOnOpenMPDispatchDirective(ArrayRef Clauses, SourceLocation TargetCallLoc; - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { Expr *TargetCall = nullptr; auto *E = dyn_cast(S); @@ -10808,10 +10841,10 @@ StmtResult Sema::ActOnOpenMPDispatchDirective(ArrayRef Clauses, TargetCallLoc = TargetCall->getExprLoc(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPDispatchDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, - TargetCallLoc); + return OMPDispatchDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt, TargetCallLoc); } static bool checkGenericLoopLastprivate(Sema &S, ArrayRef Clauses, @@ -10839,7 +10872,7 @@ static bool checkGenericLoopLastprivate(Sema &S, ArrayRef Clauses, return ErrorFound; } -StmtResult Sema::ActOnOpenMPGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, 
VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10848,7 +10881,7 @@ StmtResult Sema::ActOnOpenMPGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_loop, DSAStack)) + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_loop, DSAStack)) return StmtError(); auto *CS = cast(AStmt); @@ -10863,19 +10896,19 @@ StmtResult Sema::ActOnOpenMPGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_loop, getCollapseNumberExpr(Clauses), getOrderedNumberExpr(Clauses), - AStmt, *this, *DSAStack, VarsWithImplicitDSA, B); + AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); - return OMPGenericLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPGenericLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10884,7 +10917,7 @@ StmtResult Sema::ActOnOpenMPTeamsGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_teams_loop, DSAStack)) + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_teams_loop, DSAStack)) return StmtError(); auto *CS = cast(AStmt); @@ -10909,22 +10942,22 @@ StmtResult Sema::ActOnOpenMPTeamsGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. 
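For readers tracking what ``checkOpenMPLoop`` is handed in these hunks: with a ``collapse`` clause the associated loops form a single iteration space, so the nested-loop count comes from the clause expression (``getCollapseNumberExpr(Clauses)``) rather than defaulting to one. An illustrative source example, not part of the patch:

.. code-block:: c++

  // collapse(2): sema analyzes both loops as one 16-iteration space and
  // pre-builds the combined bounds and iteration-variable expressions
  // (the "B" helper exprs threaded through the calls above).
  void work(int, int);
  void collapsed() {
  #pragma omp parallel for collapse(2)
    for (int i = 0; i < 4; ++i)
      for (int j = 0; j < 4; ++j)
        work(i, j);
  }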
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_teams_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10933,7 +10966,7 @@ StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_target_teams_loop, + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_target_teams_loop, DSAStack)) return StmtError(); @@ -10959,22 +10992,22 @@ StmtResult Sema::ActOnOpenMPTargetTeamsGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_teams_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, - teamsLoopCanBeParallelFor(AStmt, *this)); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + teamsLoopCanBeParallelFor(AStmt, SemaRef)); } -StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -10983,7 +11016,8 @@ StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_parallel_loop, DSAStack)) + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_parallel_loop, + DSAStack)) return StmtError(); auto *CS = cast(AStmt); @@ -11008,21 +11042,21 @@ StmtResult Sema::ActOnOpenMPParallelGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_parallel_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetParallelGenericLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelGenericLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -11031,7 +11065,7 @@ StmtResult Sema::ActOnOpenMPTargetParallelGenericLoopDirective( // OpenMP 5.1 [2.11.7, loop construct, Restrictions] // A list item may not appear in a lastprivate clause unless it is the // loop iteration variable of a loop that is associated with the construct. - if (checkGenericLoopLastprivate(*this, Clauses, OMPD_target_parallel_loop, + if (checkGenericLoopLastprivate(SemaRef, Clauses, OMPD_target_parallel_loop, DSAStack)) return StmtError(); @@ -11057,30 +11091,30 @@ StmtResult Sema::ActOnOpenMPTargetParallelGenericLoopDirective( // In presence of clause 'collapse', it will define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_parallel_loop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelGenericLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPSingleDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPSingleDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); assert(isa(AStmt) && "Captured statement expected"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); // OpenMP [2.7.3, single Construct, Restrictions] // The copyprivate clause must not be used with the nowait clause. 
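The restriction named in the comment above is user-visible; a small example that the check (unchanged by the move) still rejects:

.. code-block:: c++

  void single_copyprivate() {
    int x = 0;
  #pragma omp parallel private(x)
    {
      // error: 'copyprivate' and 'nowait' may not appear on the same
      // 'single' directive (OpenMP [2.7.3, single Construct, Restrictions]).
  #pragma omp single copyprivate(x) nowait
      { x = 1; }
    }
  }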
@@ -11099,33 +11133,35 @@ StmtResult Sema::ActOnOpenMPSingleDirective(ArrayRef Clauses, } } - return OMPSingleDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPSingleDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult Sema::ActOnOpenMPMasterDirective(Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPMasterDirective(Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPMasterDirective::Create(Context, StartLoc, EndLoc, AStmt); + return OMPMasterDirective::Create(getASTContext(), StartLoc, EndLoc, AStmt); } -StmtResult Sema::ActOnOpenMPMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPMaskedDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPMaskedDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPMaskedDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult Sema::ActOnOpenMPCriticalDirective( +StmtResult SemaOpenMP::ActOnOpenMPCriticalDirective( const DeclarationNameInfo &DirName, ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc) { if (!AStmt) @@ -11146,7 +11182,7 @@ StmtResult Sema::ActOnOpenMPCriticalDirective( E->isInstantiationDependent()) { DependentHint = true; } else { - Hint = E->EvaluateKnownConstInt(Context); + Hint = E->EvaluateKnownConstInt(getASTContext()); HintLoc = C->getBeginLoc(); } } @@ -11165,7 +11201,7 @@ StmtResult Sema::ActOnOpenMPCriticalDirective( if (const auto *C = Pair.first->getSingleClause()) { Diag(C->getBeginLoc(), diag::note_omp_critical_hint_here) << 1 - << toString(C->getHint()->EvaluateKnownConstInt(Context), + << toString(C->getHint()->EvaluateKnownConstInt(getASTContext()), /*Radix=*/10, /*Signed=*/false); } else { Diag(Pair.first->getBeginLoc(), diag::note_omp_critical_no_hint) << 1; @@ -11173,16 +11209,16 @@ StmtResult Sema::ActOnOpenMPCriticalDirective( } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - auto *Dir = OMPCriticalDirective::Create(Context, DirName, StartLoc, EndLoc, - Clauses, AStmt); + auto *Dir = OMPCriticalDirective::Create(getASTContext(), DirName, StartLoc, + EndLoc, Clauses, AStmt); if (!Pair.first && DirName.getName() && !DependentHint) DSAStack->addCriticalWithHint(Dir, Hint); return Dir; } -StmtResult Sema::ActOnOpenMPParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -11201,32 +11237,32 @@ StmtResult Sema::ActOnOpenMPParallelForDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp parallel for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -11245,34 +11281,33 @@ StmtResult Sema::ActOnOpenMPParallelForSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_parallel_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), AStmt, *this, *DSAStack, + getOrderedNumberExpr(Clauses), AStmt, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult -Sema::ActOnOpenMPParallelMasterDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPParallelMasterDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -11285,17 +11320,16 @@ Sema::ActOnOpenMPParallelMasterDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
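``checkSimdlenSafelenSpecified``, whose call sites are rewritten above, enforces the usual simd rule: when both clauses are present, ``simdlen`` must not exceed ``safelen``. Illustrative example, not part of the patch:

.. code-block:: c++

  void axpy(int n, float *a, const float *b) {
    // error: the value of 'simdlen' must be less than or equal to the
    // value of 'safelen' when both clauses appear on the directive.
  #pragma omp parallel for simd simdlen(8) safelen(4)
    for (int i = 0; i < n; ++i)
      a[i] += b[i];
  }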
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef()); } -StmtResult -Sema::ActOnOpenMPParallelMaskedDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -11308,17 +11342,16 @@ Sema::ActOnOpenMPParallelMaskedDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef()); } -StmtResult -Sema::ActOnOpenMPParallelSectionsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPParallelSectionsDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -11348,10 +11381,10 @@ Sema::ActOnOpenMPParallelSectionsDirective(ArrayRef Clauses, return StmtError(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelSectionsDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } @@ -11378,16 +11411,17 @@ static bool checkMutuallyExclusiveClauses( return ErrorFound; } -StmtResult Sema::ActOnOpenMPTaskDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTaskDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); // OpenMP 5.0, 2.10.1 task Construct // If a detach clause appears on the directive, then a mergeable clause cannot // appear on the same directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_detach, OMPC_mergeable})) return StmtError(); @@ -11399,26 +11433,26 @@ StmtResult Sema::ActOnOpenMPTaskDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
   CS->getCapturedDecl()->setNothrow();
-  setFunctionHasBranchProtectedScope();
+  SemaRef.setFunctionHasBranchProtectedScope();
-  return OMPTaskDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt,
-                                  DSAStack->isCancelRegion());
+  return OMPTaskDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses,
+                                  AStmt, DSAStack->isCancelRegion());
 }
-StmtResult Sema::ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc,
-                                               SourceLocation EndLoc) {
-  return OMPTaskyieldDirective::Create(Context, StartLoc, EndLoc);
+StmtResult SemaOpenMP::ActOnOpenMPTaskyieldDirective(SourceLocation StartLoc,
+                                                     SourceLocation EndLoc) {
+  return OMPTaskyieldDirective::Create(getASTContext(), StartLoc, EndLoc);
 }
-StmtResult Sema::ActOnOpenMPBarrierDirective(SourceLocation StartLoc,
-                                             SourceLocation EndLoc) {
-  return OMPBarrierDirective::Create(Context, StartLoc, EndLoc);
+StmtResult SemaOpenMP::ActOnOpenMPBarrierDirective(SourceLocation StartLoc,
+                                                   SourceLocation EndLoc) {
+  return OMPBarrierDirective::Create(getASTContext(), StartLoc, EndLoc);
 }
-StmtResult Sema::ActOnOpenMPErrorDirective(ArrayRef<OMPClause *> Clauses,
-                                           SourceLocation StartLoc,
-                                           SourceLocation EndLoc,
-                                           bool InExContext) {
+StmtResult SemaOpenMP::ActOnOpenMPErrorDirective(ArrayRef<OMPClause *> Clauses,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation EndLoc,
+                                                 bool InExContext) {
   const OMPAtClause *AtC =
       OMPExecutableDirective::getSingleClause<OMPAtClause>(Clauses);
@@ -11443,12 +11477,13 @@ StmtResult Sema::ActOnOpenMPErrorDirective(ArrayRef<OMPClause *> Clauses,
     if (!SeverityC || SeverityC->getSeverityKind() != OMPC_SEVERITY_warning)
       return StmtError();
   }
-  return OMPErrorDirective::Create(Context, StartLoc, EndLoc, Clauses);
+  return OMPErrorDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses);
 }
-StmtResult Sema::ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
-                                              SourceLocation StartLoc,
-                                              SourceLocation EndLoc) {
+StmtResult
+SemaOpenMP::ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
+                                         SourceLocation StartLoc,
+                                         SourceLocation EndLoc) {
   const OMPNowaitClause *NowaitC =
       OMPExecutableDirective::getSingleClause<OMPNowaitClause>(Clauses);
   bool HasDependC =
@@ -11459,28 +11494,29 @@ StmtResult Sema::ActOnOpenMPTaskwaitDirective(ArrayRef<OMPClause *> Clauses,
     return StmtError();
   }
-  return OMPTaskwaitDirective::Create(Context, StartLoc, EndLoc, Clauses);
+  return OMPTaskwaitDirective::Create(getASTContext(), StartLoc, EndLoc,
+                                      Clauses);
 }
-StmtResult Sema::ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
-                                               Stmt *AStmt,
-                                               SourceLocation StartLoc,
-                                               SourceLocation EndLoc) {
+StmtResult
+SemaOpenMP::ActOnOpenMPTaskgroupDirective(ArrayRef<OMPClause *> Clauses,
+                                          Stmt *AStmt, SourceLocation StartLoc,
+                                          SourceLocation EndLoc) {
   if (!AStmt)
     return StmtError();
   assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
-  setFunctionHasBranchProtectedScope();
+  SemaRef.setFunctionHasBranchProtectedScope();
-  return OMPTaskgroupDirective::Create(Context, StartLoc, EndLoc, Clauses,
-                                       AStmt,
+  return OMPTaskgroupDirective::Create(getASTContext(), StartLoc, EndLoc,
+                                       Clauses, AStmt,
                                        DSAStack->getTaskgroupReductionRef());
 }
-StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
-                                           SourceLocation StartLoc,
-                                           SourceLocation EndLoc) {
+StmtResult SemaOpenMP::ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
+                                                 SourceLocation StartLoc,
+                                                 SourceLocation EndLoc) {
   OMPFlushClause *FC = nullptr;
   OMPClause *OrderClause = nullptr;
   for (OMPClause *C : Clauses) {
@@ -11514,12 +11550,12 @@ StmtResult Sema::ActOnOpenMPFlushDirective(ArrayRef<OMPClause *> Clauses,
         << getOpenMPClauseName(OrderClause->getClauseKind());
     return StmtError();
   }
-  return OMPFlushDirective::Create(Context, StartLoc, EndLoc, Clauses);
+  return OMPFlushDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses);
 }
-StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef<OMPClause *> Clauses,
-                                            SourceLocation StartLoc,
-                                            SourceLocation EndLoc) {
+StmtResult SemaOpenMP::ActOnOpenMPDepobjDirective(ArrayRef<OMPClause *> Clauses,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation EndLoc) {
   if (Clauses.empty()) {
     Diag(StartLoc, diag::err_omp_depobj_expected);
     return StmtError();
@@ -11536,12 +11572,12 @@ StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef<OMPClause *> Clauses,
     Diag(Clauses[0]->getEndLoc(), diag::err_omp_depobj_single_clause_expected);
     return StmtError();
   }
-  return OMPDepobjDirective::Create(Context, StartLoc, EndLoc, Clauses);
+  return OMPDepobjDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses);
 }
-StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
-                                          SourceLocation StartLoc,
-                                          SourceLocation EndLoc) {
+StmtResult SemaOpenMP::ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
+                                                SourceLocation StartLoc,
+                                                SourceLocation EndLoc) {
   // Check that exactly one clause is specified.
   if (Clauses.size() != 1) {
     Diag(Clauses.empty() ? EndLoc : Clauses[1]->getBeginLoc(),
@@ -11566,13 +11602,13 @@ StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef<OMPClause *> Clauses,
     return StmtError();
   }
   DSAStack->setParentHasScanDirective(StartLoc);
-  return OMPScanDirective::Create(Context, StartLoc, EndLoc, Clauses);
+  return OMPScanDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses);
 }
-StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
-                                             Stmt *AStmt,
-                                             SourceLocation StartLoc,
-                                             SourceLocation EndLoc) {
+StmtResult
+SemaOpenMP::ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
+                                        Stmt *AStmt, SourceLocation StartLoc,
+                                        SourceLocation EndLoc) {
   const OMPClause *DependFound = nullptr;
   const OMPClause *DependSourceClause = nullptr;
   const OMPClause *DependSinkClause = nullptr;
@@ -11631,7 +11667,7 @@ StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
       // An ordered construct with the simd clause is the only OpenMP construct
       // that can appear in the simd region.
       Diag(StartLoc, diag::err_omp_prohibited_region_simd)
-          << (LangOpts.OpenMP >= 50 ? 1 : 0);
+          << (getLangOpts().OpenMP >= 50 ? 1 : 0);
       ErrorFound = true;
     } else if ((DependFound || DoacrossFound) && (TC || SC)) {
       SourceLocation Loc =
@@ -11678,10 +11714,11 @@ StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef<OMPClause *> Clauses,
   if (AStmt) {
     assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
-    setFunctionHasBranchProtectedScope();
+    SemaRef.setFunctionHasBranchProtectedScope();
   }
-  return OMPOrderedDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt);
+  return OMPOrderedDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses,
+                                     AStmt);
 }
 namespace {
@@ -12739,10 +12776,11 @@ bool OpenMPAtomicCompareCaptureChecker::checkStmt(Stmt *S,
   }
 } // namespace
-StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef<OMPClause *> Clauses,
-                                            Stmt *AStmt,
-                                            SourceLocation StartLoc,
-                                            SourceLocation EndLoc) {
+StmtResult SemaOpenMP::ActOnOpenMPAtomicDirective(ArrayRef<OMPClause *> Clauses,
+                                                  Stmt *AStmt,
+                                                  SourceLocation StartLoc,
+                                                  SourceLocation EndLoc) {
+  ASTContext &Context = getASTContext();
   // Register location of the first atomic directive.
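Before the atomic hunks continue, a reminder of the statement shapes the checkers touched here (``OpenMPAtomicUpdateChecker`` and friends) are matching; illustrative only, not part of the patch:

.. code-block:: c++

  void atomics(int &x, int &v, int e) {
  #pragma omp atomic write
    x = e;              // write: plain assignment to x

  #pragma omp atomic update
    x += e;             // update: x binop= expr (and the other spellings)

  #pragma omp atomic capture
    { v = x; x += e; }  // capture: a read of x paired with an update of x
  }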
DSAStack->addAtomicDirectiveLoc(StartLoc); if (!AStmt) @@ -12945,7 +12983,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) V = X = nullptr; } else if (AtomicKind == OMPC_write) { enum { @@ -13007,7 +13045,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) E = X = nullptr; } else if (AtomicKind == OMPC_update || AtomicKind == OMPC_unknown) { // If clause is update: @@ -13018,7 +13056,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, // x binop= expr; // x = x binop expr; // x = expr binop x; - OpenMPAtomicUpdateChecker Checker(*this); + OpenMPAtomicUpdateChecker Checker(SemaRef); if (Checker.checkStatement( Body, (AtomicKind == OMPC_update) @@ -13026,7 +13064,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, : diag::err_omp_atomic_not_expression_statement, diag::note_omp_atomic_update)) return StmtError(); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { E = Checker.getExpr(); X = Checker.getX(); UE = Checker.getUpdateExpr(); @@ -13056,7 +13094,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, if (AtomicBinOp && AtomicBinOp->getOpcode() == BO_Assign) { V = AtomicBinOp->getLHS(); Body = AtomicBinOp->getRHS()->IgnoreParenImpCasts(); - OpenMPAtomicUpdateChecker Checker(*this); + OpenMPAtomicUpdateChecker Checker(SemaRef); if (Checker.checkStatement( Body, diag::err_omp_atomic_capture_not_expression_statement, diag::note_omp_atomic_update)) @@ -13081,7 +13119,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) UE = V = E = X = nullptr; } else { // If clause is a capture: @@ -13110,14 +13148,14 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, if (auto *EWC = dyn_cast(Second)) Second = EWC->getSubExpr()->IgnoreParenImpCasts(); // Need to find what subexpression is 'v' and what is 'x'. 
- OpenMPAtomicUpdateChecker Checker(*this); + OpenMPAtomicUpdateChecker Checker(SemaRef); bool IsUpdateExprFound = !Checker.checkStatement(Second); BinaryOperator *BinOp = nullptr; if (IsUpdateExprFound) { BinOp = dyn_cast(First); IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign; } - if (IsUpdateExprFound && !CurContext->isDependentContext()) { + if (IsUpdateExprFound && !SemaRef.CurContext->isDependentContext()) { // { v = x; x++; } // { v = x; x--; } // { v = x; ++x; } @@ -13147,7 +13185,8 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, BinOp = dyn_cast(Second); IsUpdateExprFound = BinOp && BinOp->getOpcode() == BO_Assign; } - if (IsUpdateExprFound && !CurContext->isDependentContext()) { + if (IsUpdateExprFound && + !SemaRef.CurContext->isDependentContext()) { // { x++; v = x; } // { x--; v = x; } // { ++x; v = x; } @@ -13244,12 +13283,12 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, Diag(NoteLoc, diag::note_omp_atomic_capture) << ErrorFound << NoteRange; return StmtError(); } - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) UE = V = E = X = nullptr; } else if (AtomicKind == OMPC_compare) { if (IsCompareCapture) { OpenMPAtomicCompareCaptureChecker::ErrorInfoTy ErrorInfo; - OpenMPAtomicCompareCaptureChecker Checker(*this); + OpenMPAtomicCompareCaptureChecker Checker(SemaRef); if (!Checker.checkStmt(Body, ErrorInfo)) { Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare_capture) << ErrorInfo.ErrorRange; @@ -13269,7 +13308,7 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, IsPostfixUpdate = Checker.isPostfixUpdate(); } else { OpenMPAtomicCompareChecker::ErrorInfoTy ErrorInfo; - OpenMPAtomicCompareChecker Checker(*this); + OpenMPAtomicCompareChecker Checker(SemaRef); if (!Checker.checkStmt(Body, ErrorInfo)) { Diag(ErrorInfo.ErrorLoc, diag::err_omp_atomic_compare) << ErrorInfo.ErrorRange; @@ -13307,17 +13346,17 @@ StmtResult Sema::ActOnOpenMPAtomicDirective(ArrayRef Clauses, } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPAtomicDirective::Create( Context, StartLoc, EndLoc, Clauses, AStmt, {X, V, R, E, UE, D, CE, IsXLHSInRHSPart, IsPostfixUpdate, IsFailOnly}); } -StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTargetDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13374,15 +13413,15 @@ StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef Clauses, } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPTargetDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult -Sema::ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13404,14 +13443,14 @@ Sema::ActOnOpenMPTargetParallelDirective(ArrayRef Clauses, CS->getCapturedDecl()->setNothrow(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelDirective::Create( - Context, StartLoc, EndLoc, Clauses, AStmt, + getASTContext(), StartLoc, 
EndLoc, Clauses, AStmt, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTargetParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13440,28 +13479,28 @@ StmtResult Sema::ActOnOpenMPTargetParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_parallel_for, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), CS, *this, *DSAStack, + getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target parallel for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } @@ -13498,10 +13537,10 @@ static bool isClauseMappable(ArrayRef Clauses) { return true; } -StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, + Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13511,9 +13550,10 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, // At least one map, use_device_addr or use_device_ptr clause must appear on // the directive. 
if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr) && - (LangOpts.OpenMP < 50 || !hasClauses(Clauses, OMPC_use_device_addr))) { + (getLangOpts().OpenMP < 50 || + !hasClauses(Clauses, OMPC_use_device_addr))) { StringRef Expected; - if (LangOpts.OpenMP < 50) + if (getLangOpts().OpenMP < 50) Expected = "'map' or 'use_device_ptr'"; else Expected = "'map', 'use_device_ptr', or 'use_device_addr'"; @@ -13522,16 +13562,15 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef Clauses, return StmtError(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPTargetDataDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetDataDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult -Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPTargetEnterDataDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AStmt) { if (!AStmt) return StmtError(); @@ -13561,14 +13600,13 @@ Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetEnterDataDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetEnterDataDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult -Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPTargetExitDataDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AStmt) { if (!AStmt) return StmtError(); @@ -13598,14 +13636,13 @@ Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetExitDataDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetExitDataDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - Stmt *AStmt) { +StmtResult SemaOpenMP::ActOnOpenMPTargetUpdateDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, Stmt *AStmt) { if (!AStmt) return StmtError(); @@ -13637,13 +13674,14 @@ StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetUpdateDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetUpdateDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTeamsDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -13659,17 +13697,17 @@ StmtResult Sema::ActOnOpenMPTeamsDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
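The ``target data`` hunk above keeps the clause-presence rule intact while switching to ``getLangOpts()``: at least one of ``map`` or ``use_device_ptr`` is required, with ``use_device_addr`` joining the accepted set from OpenMP 5.0 on. Illustrative example, not part of the patch:

.. code-block:: c++

  void device_region(float *p, int n) {
  #pragma omp target data                        // error: expected at least one
    {}                                           // 'map' or 'use_device_ptr'

  #pragma omp target data map(tofrom : p[0:n])   // OK
    {}
  }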
CS->getCapturedDecl()->setNothrow(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); - return OMPTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPTeamsDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -StmtResult -Sema::ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion) { +StmtResult SemaOpenMP::ActOnOpenMPCancellationPointDirective( + SourceLocation StartLoc, SourceLocation EndLoc, + OpenMPDirectiveKind CancelRegion) { if (DSAStack->isParentNowaitRegion()) { Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 0; return StmtError(); @@ -13678,14 +13716,13 @@ Sema::ActOnOpenMPCancellationPointDirective(SourceLocation StartLoc, Diag(StartLoc, diag::err_omp_parent_cancel_region_ordered) << 0; return StmtError(); } - return OMPCancellationPointDirective::Create(Context, StartLoc, EndLoc, - CancelRegion); + return OMPCancellationPointDirective::Create(getASTContext(), StartLoc, + EndLoc, CancelRegion); } -StmtResult Sema::ActOnOpenMPCancelDirective(ArrayRef Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc, - OpenMPDirectiveKind CancelRegion) { +StmtResult SemaOpenMP::ActOnOpenMPCancelDirective( + ArrayRef Clauses, SourceLocation StartLoc, + SourceLocation EndLoc, OpenMPDirectiveKind CancelRegion) { if (DSAStack->isParentNowaitRegion()) { Diag(StartLoc, diag::err_omp_parent_cancel_region_nowait) << 1; return StmtError(); @@ -13695,7 +13732,7 @@ StmtResult Sema::ActOnOpenMPCancelDirective(ArrayRef Clauses, return StmtError(); } DSAStack->setParentCancelRegion(/*Cancel=*/true); - return OMPCancelDirective::Create(Context, StartLoc, EndLoc, Clauses, + return OMPCancelDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, CancelRegion); } @@ -13726,7 +13763,7 @@ static bool checkReductionClauseWithNogroup(Sema &S, return false; } -StmtResult Sema::ActOnOpenMPTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13738,33 +13775,33 @@ StmtResult Sema::ActOnOpenMPTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPTaskLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPTaskLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13776,21 +13813,21 @@ StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -13798,23 +13835,23 @@ StmtResult Sema::ActOnOpenMPTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPTaskLoopSimdDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPTaskLoopSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPMasterTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13826,33 +13863,33 @@ StmtResult Sema::ActOnOpenMPMasterTaskLoopDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_master_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPMasterTaskLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPMasterTaskLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPMaskedTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13864,33 +13901,33 @@ StmtResult Sema::ActOnOpenMPMaskedTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_masked_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
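The two taskloop restrictions quoted in the comments above are enforced by the helpers whose call sites this patch rewrites (``checkMutuallyExclusiveClauses`` and ``checkReductionClauseWithNogroup``). An illustrative example of code they reject, not part of the patch:

.. code-block:: c++

  extern void body(int);
  void taskloops(int n) {
    int s = 0;
    // error: 'grainsize' and 'num_tasks' are mutually exclusive.
  #pragma omp taskloop grainsize(4) num_tasks(8)
    for (int i = 0; i < n; ++i)
      body(i);

    // error: 'reduction' needs the implicit taskgroup, which 'nogroup' removes.
  #pragma omp taskloop reduction(+ : s) nogroup
    for (int i = 0; i < n; ++i)
      s += i;
  }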
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPMaskedTaskLoopDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPMaskedTaskLoopDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPMasterTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13902,21 +13939,21 @@ StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_master_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -13924,23 +13961,23 @@ StmtResult Sema::ActOnOpenMPMasterTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPMasterTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPMaskedTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPMaskedTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -13952,21 +13989,21 @@ StmtResult Sema::ActOnOpenMPMaskedTaskLoopSimdDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_masked_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, AStmt, *this, *DSAStack, - VarsWithImplicitDSA, B); + /*OrderedLoopCountExpr=*/nullptr, AStmt, SemaRef, + *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -13974,23 +14011,23 @@ StmtResult Sema::ActOnOpenMPMaskedTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPMaskedTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14021,33 +14058,33 @@ StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_master_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
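// Rationale, loosely stated (editorial gloss of the spec, not patch text;
// `f`, `n`, `sum` are hypothetical): a taskloop reduction is combined at the
// implicit taskgroup that closes the construct, and nogroup elides exactly
// that taskgroup, leaving no point at which the partial results could be
// merged. The plain form remains fine:
//
//   #pragma omp parallel master taskloop reduction(+ : sum) // OK: implicit
//   for (int i = 0; i < n; ++i)                             // taskgroup
//     sum += f(i);                                          // merges sum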
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterTaskLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14078,33 +14115,33 @@ StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_masked_taskloop, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedTaskLoopDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMasterTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14135,21 +14172,21 @@ StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_master_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
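// Concretely (hypothetical sketch): a linear step may name a template
// parameter, which is why this finalization is skipped in dependent contexts
// and re-done at instantiation, once the step is known:
//
//   template <int Step>
//   void h(float *p, int n) {
//   #pragma omp parallel master taskloop simd linear(p : Step)
//     for (int i = 0; i < n; ++i)
//       *p = i; // p advances by Step each iteration
//   }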
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -14157,23 +14194,23 @@ StmtResult Sema::ActOnOpenMPParallelMasterTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. - if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMasterTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14204,21 +14241,21 @@ StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_parallel_masked_taskloop_simd, getCollapseNumberExpr(Clauses), - /*OrderedLoopCountExpr=*/nullptr, CS, *this, *DSAStack, + /*OrderedLoopCountExpr=*/nullptr, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } @@ -14226,23 +14263,23 @@ StmtResult Sema::ActOnOpenMPParallelMaskedTaskLoopSimdDirective( // OpenMP, [2.9.2 taskloop Construct, Restrictions] // The grainsize clause and num_tasks clause are mutually exclusive and may // not appear on the same taskloop directive. - if (checkMutuallyExclusiveClauses(*this, Clauses, + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, {OMPC_grainsize, OMPC_num_tasks})) return StmtError(); // OpenMP, [2.9.2 taskloop Construct, Restrictions] // If a reduction clause is present on the taskloop directive, the nogroup // clause must not be specified. 
- if (checkReductionClauseWithNogroup(*this, Clauses)) + if (checkReductionClauseWithNogroup(SemaRef, Clauses)) return StmtError(); - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPParallelMaskedTaskLoopSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPDistributeDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14258,21 +14295,21 @@ StmtResult Sema::ActOnOpenMPDistributeDirective( unsigned NestedLoopCount = checkOpenMPLoop(OMPD_distribute, getCollapseNumberExpr(Clauses), nullptr /*ordered not a clause on distribute*/, AStmt, - *this, *DSAStack, VarsWithImplicitDSA, B); + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); auto *DistributeDirective = OMPDistributeDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getMappedDirective()); return DistributeDirective; } -StmtResult Sema::ActOnOpenMPDistributeParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14302,21 +14339,21 @@ StmtResult Sema::ActOnOpenMPDistributeParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14346,34 +14383,34 @@ StmtResult Sema::ActOnOpenMPDistributeParallelForSimdDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop( OMPD_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPDistributeParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPDistributeSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14402,34 +14439,34 @@ StmtResult Sema::ActOnOpenMPDistributeSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, - *DSAStack, VarsWithImplicitDSA, B); + nullptr /*ordered not a clause on distribute*/, CS, + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPDistributeSimdDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPDistributeSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14459,33 +14496,33 @@ StmtResult Sema::ActOnOpenMPTargetParallelForSimdDirective( // define the nested loops number. 
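// Aside on checkSimdlenSafelenSpecified, used by the simd forms above and
// below (informal example): OpenMP requires simdlen(N) <= safelen(M), since
// safelen bounds how far apart iterations may execute concurrently. So
// `simdlen(4) safelen(8)` is accepted, while `simdlen(8) safelen(4)` is
// diagnosed.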
unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_parallel_for_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), CS, *this, *DSAStack, VarsWithImplicitDSA, - B); + getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack, + VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target parallel for simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14514,34 +14551,34 @@ StmtResult Sema::ActOnOpenMPTargetSimdDirective( // nested loops number. unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_simd, getCollapseNumberExpr(Clauses), - getOrderedNumberExpr(Clauses), CS, *this, *DSAStack, + getOrderedNumberExpr(Clauses), CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); - return OMPTargetSimdDirective::Create(Context, StartLoc, EndLoc, + SemaRef.setFunctionHasBranchProtectedScope(); + return OMPTargetSimdDirective::Create(getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14570,23 +14607,23 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeDirective( // define the nested loops number. 
unsigned NestedLoopCount = checkOpenMPLoop(OMPD_teams_distribute, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, - *DSAStack, VarsWithImplicitDSA, B); + nullptr /*ordered not a clause on distribute*/, CS, + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp teams distribute loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14616,38 +14653,38 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_teams_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp teams distribute simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14678,38 +14715,38 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForSimdDirective( // define the nested loops number. 
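// The hunks above and below all apply one mechanical rewrite. A minimal
// sketch of the receiving interface (assumed shape; the actual SemaOpenMP
// declaration lives in its header, which is not part of this excerpt):
//
//   class SemaOpenMP : public SemaBase {
//     // SemaBase is assumed to supply `Sema &SemaRef` plus the
//     // getASTContext()/getLangOpts()/Diag() forwarders used here.
//   public:
//     StmtResult ActOnOpenMPTeamsDistributeParallelForSimdDirective(
//         ArrayRef<OMPClause *> Clauses, Stmt *AStmt, SourceLocation StartLoc,
//         SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA);
//   };
//
// Hence the uniform substitutions: `*this` -> `SemaRef`, `Context` ->
// `getASTContext()`, `LangOpts` -> `getLangOpts()`, `CurContext` ->
// `SemaRef.CurContext`, `CurScope` -> `SemaRef.getCurScope()`.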
unsigned NestedLoopCount = checkOpenMPLoop( OMPD_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPTeamsDistributeParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14740,28 +14777,27 @@ StmtResult Sema::ActOnOpenMPTeamsDistributeParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp for loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); DSAStack->setParentTeamsRegionLoc(StartLoc); return OMPTeamsDistributeParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDirective( + ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); @@ -14783,7 +14819,7 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, // longjmp() and throw() must not violate the entry/exit criteria. 
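// Put differently (informal example; `risky` is hypothetical): no exception
// may propagate out of the outlined region once its CapturedDecl is marked
// nothrow, so a throw has to be handled inside:
//
//   #pragma omp target teams
//   {
//     try { risky(); } catch (...) { /* handled within the region */ }
//   }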
CS->getCapturedDecl()->setNothrow(); } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); const OMPClause *BareClause = nullptr; bool HasThreadLimitAndNumTeamsClause = hasClauses(Clauses, OMPC_num_teams) && @@ -14798,11 +14834,11 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDirective(ArrayRef Clauses, return StmtError(); } - return OMPTargetTeamsDirective::Create(Context, StartLoc, EndLoc, Clauses, - AStmt); + return OMPTargetTeamsDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses, AStmt); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14832,20 +14868,20 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_teams_distribute, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute loop exprs were not built"); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14875,32 +14911,32 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_teams_distribute_parallel_for, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute parallel for loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeParallelForDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B, DSAStack->getTaskgroupReductionRef(), DSAStack->isCancelRegion()); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14931,35 +14967,35 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeParallelForSimdDirective( unsigned NestedLoopCount = checkOpenMPLoop(OMPD_target_teams_distribute_parallel_for_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, - *DSAStack, VarsWithImplicitDSA, B); + nullptr /*ordered not a clause on distribute*/, CS, + SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute parallel for simd loop exprs were not " "built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast(C)) if (FinishOpenMPLinearClause(*LC, cast(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeParallelForSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective( +StmtResult SemaOpenMP::ActOnOpenMPTargetTeamsDistributeSimdDirective( ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, VarsWithInheritedDSAType &VarsWithImplicitDSA) { if (!AStmt) @@ -14989,34 +15025,34 @@ StmtResult Sema::ActOnOpenMPTargetTeamsDistributeSimdDirective( // define the nested loops number. unsigned NestedLoopCount = checkOpenMPLoop( OMPD_target_teams_distribute_simd, getCollapseNumberExpr(Clauses), - nullptr /*ordered not a clause on distribute*/, CS, *this, *DSAStack, + nullptr /*ordered not a clause on distribute*/, CS, SemaRef, *DSAStack, VarsWithImplicitDSA, B); if (NestedLoopCount == 0) return StmtError(); - assert((CurContext->isDependentContext() || B.builtAll()) && + assert((SemaRef.CurContext->isDependentContext() || B.builtAll()) && "omp target teams distribute simd loop exprs were not built"); - if (!CurContext->isDependentContext()) { + if (!SemaRef.CurContext->isDependentContext()) { // Finalize the clauses that need pre-built expressions for CodeGen. 
for (OMPClause *C : Clauses) { if (auto *LC = dyn_cast<OMPLinearClause>(C)) if (FinishOpenMPLinearClause(*LC, cast<DeclRefExpr>(B.IterationVarRef), - B.NumIterations, *this, CurScope, - DSAStack)) + B.NumIterations, SemaRef, + SemaRef.getCurScope(), DSAStack)) return StmtError(); } } - if (checkSimdlenSafelenSpecified(*this, Clauses)) + if (checkSimdlenSafelenSpecified(SemaRef, Clauses)) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); return OMPTargetTeamsDistributeSimdDirective::Create( - Context, StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); + getASTContext(), StartLoc, EndLoc, NestedLoopCount, Clauses, AStmt, B); } -bool Sema::checkTransformableLoopNest( +bool SemaOpenMP::checkTransformableLoopNest( OpenMPDirectiveKind Kind, Stmt *AStmt, int NumLoops, SmallVectorImpl<OMPLoopBasedDirective::HelperExprs> &LoopHelpers, Stmt *&Body, @@ -15029,7 +15065,7 @@ bool Sema::checkTransformableLoopNest( Stmt *CurStmt) { VarsWithInheritedDSAType TmpDSA; unsigned SingleNumLoops = - checkOpenMPLoop(Kind, nullptr, nullptr, CurStmt, *this, *DSAStack, + checkOpenMPLoop(Kind, nullptr, nullptr, CurStmt, SemaRef, *DSAStack, TmpDSA, LoopHelpers[Cnt]); if (SingleNumLoops == 0) return true; @@ -15065,9 +15101,11 @@ bool Sema::checkTransformableLoopNest( return Result; } -StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + ASTContext &Context = getASTContext(); auto SizesClauses = OMPExecutableDirective::getClausesOfKind<OMPSizesClause>(Clauses); if (SizesClauses.empty()) { @@ -15091,7 +15129,7 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses, return StmtError(); // Delay tiling to when template is completely instantiated. - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) return OMPTileDirective::Create(Context, StartLoc, EndLoc, Clauses, NumLoops, AStmt, nullptr, nullptr); @@ -15117,7 +15155,7 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses, std::string FloorCntName = (Twine(".floor_") + llvm::utostr(I) + ".iv." + OrigVarName).str(); VarDecl *FloorCntDecl = - buildVarDecl(*this, {}, CntTy, FloorCntName, nullptr, OrigCntVar); + buildVarDecl(SemaRef, {}, CntTy, FloorCntName, nullptr, OrigCntVar); FloorIndVars[I] = FloorCntDecl; } @@ -15130,7 +15168,8 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses, // used by the expressions to derive the original iteration variable's // value from the logical iteration number. auto *TileCntDecl = cast<VarDecl>(IterVarRef->getDecl()); - TileCntDecl->setDeclName(&PP.getIdentifierTable().get(TileCntName)); + TileCntDecl->setDeclName( + &SemaRef.PP.getIdentifierTable().get(TileCntName)); TileIndVars[I] = TileCntDecl; } for (auto &P : OriginalInits[I]) { @@ -15159,17 +15198,18 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef<OMPClause *> Clauses, auto *OrigCntVar = cast<DeclRefExpr>(LoopHelper.Counters[0]); QualType CntTy = OrigCntVar->getType(); Expr *DimTileSize = SizesClause->getSizesRefs()[I]; - Scope *CurScope = getCurScope(); + Scope *CurScope = SemaRef.getCurScope(); // Commonly used variables.
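// Shape of the loop nest assembled here and below, per tiled dimension
// (informal sketch of what `#pragma omp tile sizes(S)` expands to; the
// generated variables are actually named .floor_I.iv / .tile_I.iv):
//
//   for (auto .floor.iv = 0; .floor.iv < NumIterations; .floor.iv += S)
//     for (auto .tile.iv = .floor.iv;
//          .tile.iv < min(.floor.iv + S, NumIterations); ++.tile.iv)
//       <original body, with the original IV derived from .tile.iv>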
- DeclRefExpr *TileIV = buildDeclRefExpr(*this, TileIndVars[I], CntTy, + DeclRefExpr *TileIV = buildDeclRefExpr(SemaRef, TileIndVars[I], CntTy, OrigCntVar->getExprLoc()); - DeclRefExpr *FloorIV = buildDeclRefExpr(*this, FloorIndVars[I], CntTy, + DeclRefExpr *FloorIV = buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy, OrigCntVar->getExprLoc()); // For init-statement: auto .tile.iv = .floor.iv - AddInitializerToDecl(TileIndVars[I], DefaultLvalueConversion(FloorIV).get(), - /*DirectInit=*/false); + SemaRef.AddInitializerToDecl(TileIndVars[I], + SemaRef.DefaultLvalueConversion(FloorIV).get(), + /*DirectInit=*/false); Decl *CounterDecl = TileIndVars[I]; StmtResult InitStmt = new (Context) DeclStmt(DeclGroupRef::Create(Context, &CounterDecl, 1), @@ -15179,28 +15219,29 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, // For cond-expression: .tile.iv < min(.floor.iv + DimTileSize, // NumIterations) - ExprResult EndOfTile = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), - BO_Add, FloorIV, DimTileSize); + ExprResult EndOfTile = SemaRef.BuildBinOp( + CurScope, LoopHelper.Cond->getExprLoc(), BO_Add, FloorIV, DimTileSize); if (!EndOfTile.isUsable()) return StmtError(); ExprResult IsPartialTile = - BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, - NumIterations, EndOfTile.get()); + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, + NumIterations, EndOfTile.get()); if (!IsPartialTile.isUsable()) return StmtError(); - ExprResult MinTileAndIterSpace = ActOnConditionalOp( + ExprResult MinTileAndIterSpace = SemaRef.ActOnConditionalOp( LoopHelper.Cond->getBeginLoc(), LoopHelper.Cond->getEndLoc(), IsPartialTile.get(), NumIterations, EndOfTile.get()); if (!MinTileAndIterSpace.isUsable()) return StmtError(); - ExprResult CondExpr = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), - BO_LT, TileIV, MinTileAndIterSpace.get()); + ExprResult CondExpr = + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, + TileIV, MinTileAndIterSpace.get()); if (!CondExpr.isUsable()) return StmtError(); // For incr-statement: ++.tile.iv - ExprResult IncrStmt = - BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, TileIV); + ExprResult IncrStmt = SemaRef.BuildUnaryOp( + CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, TileIV); if (!IncrStmt.isUsable()) return StmtError(); @@ -15235,16 +15276,16 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, DeclRefExpr *OrigCntVar = cast(LoopHelper.Counters[0]); QualType CntTy = OrigCntVar->getType(); Expr *DimTileSize = SizesClause->getSizesRefs()[I]; - Scope *CurScope = getCurScope(); + Scope *CurScope = SemaRef.getCurScope(); // Commonly used variables. 
- DeclRefExpr *FloorIV = buildDeclRefExpr(*this, FloorIndVars[I], CntTy, + DeclRefExpr *FloorIV = buildDeclRefExpr(SemaRef, FloorIndVars[I], CntTy, OrigCntVar->getExprLoc()); // For init-statement: auto .floor.iv = 0 - AddInitializerToDecl( + SemaRef.AddInitializerToDecl( FloorIndVars[I], - ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(), + SemaRef.ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(), /*DirectInit=*/false); Decl *CounterDecl = FloorIndVars[I]; StmtResult InitStmt = new (Context) @@ -15254,14 +15295,15 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, return StmtError(); // For cond-expression: .floor.iv < NumIterations - ExprResult CondExpr = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), - BO_LT, FloorIV, NumIterations); + ExprResult CondExpr = SemaRef.BuildBinOp( + CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, FloorIV, NumIterations); if (!CondExpr.isUsable()) return StmtError(); // For incr-statement: .floor.iv += DimTileSize - ExprResult IncrStmt = BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), - BO_AddAssign, FloorIV, DimTileSize); + ExprResult IncrStmt = + SemaRef.BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign, + FloorIV, DimTileSize); if (!IncrStmt.isUsable()) return StmtError(); @@ -15276,15 +15318,18 @@ StmtResult Sema::ActOnOpenMPTileDirective(ArrayRef Clauses, buildPreInits(Context, PreInits)); } -StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, - Stmt *AStmt, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPUnrollDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { + ASTContext &Context = getASTContext(); + Scope *CurScope = SemaRef.getCurScope(); // Empty statement should only be possible if there already was an error. if (!AStmt) return StmtError(); - if (checkMutuallyExclusiveClauses(*this, Clauses, {OMPC_partial, OMPC_full})) + if (checkMutuallyExclusiveClauses(SemaRef, Clauses, + {OMPC_partial, OMPC_full})) return StmtError(); const OMPFullClause *FullClause = @@ -15307,7 +15352,7 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, unsigned NumGeneratedLoops = PartialClause ? 1 : 0; // Delay unrolling to when template is completely instantiated. - if (CurContext->isDependentContext()) + if (SemaRef.CurContext->isDependentContext()) return OMPUnrollDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt, NumGeneratedLoops, nullptr, nullptr); @@ -15412,8 +15457,8 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, assert(Factor > 0 && "Expected positive unroll factor"); auto MakeFactorExpr = [this, Factor, IVTy, FactorLoc]() { return IntegerLiteral::Create( - Context, llvm::APInt(Context.getIntWidth(IVTy), Factor), IVTy, - FactorLoc); + getASTContext(), llvm::APInt(getASTContext().getIntWidth(IVTy), Factor), + IVTy, FactorLoc); }; // Iteration variable SourceLocations. @@ -15430,30 +15475,31 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, // Create the iteration variable for the unrolled loop. VarDecl *OuterIVDecl = - buildVarDecl(*this, {}, IVTy, OuterIVName, nullptr, OrigVar); + buildVarDecl(SemaRef, {}, IVTy, OuterIVName, nullptr, OrigVar); auto MakeOuterRef = [this, OuterIVDecl, IVTy, OrigVarLoc]() { - return buildDeclRefExpr(*this, OuterIVDecl, IVTy, OrigVarLoc); + return buildDeclRefExpr(SemaRef, OuterIVDecl, IVTy, OrigVarLoc); }; // Iteration variable for the inner loop: Reuse the iteration variable created // by checkOpenMPLoop. 
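// For orientation (informal sketch, matching the comments below): partial
// unrolling by Factor rebuilds the loop as
//
//   for (auto .unrolled.iv = 0; .unrolled.iv < NumIterations;
//        .unrolled.iv += Factor)
//     for (auto .unroll_inner.iv = .unrolled.iv;   // unroll-hinted inner loop
//          .unroll_inner.iv < .unrolled.iv + Factor &&
//          .unroll_inner.iv < NumIterations;
//          ++.unroll_inner.iv)
//       <original body>
//
// where the duplicated bound lets ScalarEvolution prove the inner trip count
// is at most Factor, so later passes can fully unroll the inner loop.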
auto *InnerIVDecl = cast(IterationVarRef->getDecl()); - InnerIVDecl->setDeclName(&PP.getIdentifierTable().get(InnerIVName)); + InnerIVDecl->setDeclName(&SemaRef.PP.getIdentifierTable().get(InnerIVName)); auto MakeInnerRef = [this, InnerIVDecl, IVTy, OrigVarLoc]() { - return buildDeclRefExpr(*this, InnerIVDecl, IVTy, OrigVarLoc); + return buildDeclRefExpr(SemaRef, InnerIVDecl, IVTy, OrigVarLoc); }; // Make a copy of the NumIterations expression for each use: By the AST // constraints, every expression object in a DeclContext must be unique. - CaptureVars CopyTransformer(*this); + CaptureVars CopyTransformer(SemaRef); auto MakeNumIterations = [&CopyTransformer, &LoopHelper]() -> Expr * { return AssertSuccess( CopyTransformer.TransformExpr(LoopHelper.NumIterations)); }; // Inner For init-statement: auto .unroll_inner.iv = .unrolled.iv - ExprResult LValueConv = DefaultLvalueConversion(MakeOuterRef()); - AddInitializerToDecl(InnerIVDecl, LValueConv.get(), /*DirectInit=*/false); + ExprResult LValueConv = SemaRef.DefaultLvalueConversion(MakeOuterRef()); + SemaRef.AddInitializerToDecl(InnerIVDecl, LValueConv.get(), + /*DirectInit=*/false); StmtResult InnerInit = new (Context) DeclStmt(DeclGroupRef(InnerIVDecl), OrigVarLocBegin, OrigVarLocEnd); if (!InnerInit.isUsable()) @@ -15466,28 +15512,30 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, // \endcode // This conjunction of two conditions allows ScalarEvolution to derive the // maximum trip count of the inner loop. - ExprResult EndOfTile = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), - BO_Add, MakeOuterRef(), MakeFactorExpr()); + ExprResult EndOfTile = + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_Add, + MakeOuterRef(), MakeFactorExpr()); if (!EndOfTile.isUsable()) return StmtError(); - ExprResult InnerCond1 = BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), - BO_LT, MakeInnerRef(), EndOfTile.get()); + ExprResult InnerCond1 = + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, + MakeInnerRef(), EndOfTile.get()); if (!InnerCond1.isUsable()) return StmtError(); ExprResult InnerCond2 = - BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, MakeInnerRef(), - MakeNumIterations()); + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, + MakeInnerRef(), MakeNumIterations()); if (!InnerCond2.isUsable()) return StmtError(); ExprResult InnerCond = - BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LAnd, - InnerCond1.get(), InnerCond2.get()); + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LAnd, + InnerCond1.get(), InnerCond2.get()); if (!InnerCond.isUsable()) return StmtError(); // Inner For incr-statement: ++.unroll_inner.iv - ExprResult InnerIncr = BuildUnaryOp(CurScope, LoopHelper.Inc->getExprLoc(), - UO_PreInc, MakeInnerRef()); + ExprResult InnerIncr = SemaRef.BuildUnaryOp( + CurScope, LoopHelper.Inc->getExprLoc(), UO_PreInc, MakeInnerRef()); if (!InnerIncr.isUsable()) return StmtError(); @@ -15496,7 +15544,7 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, InnerBodyStmts.append(LoopHelper.Updates.begin(), LoopHelper.Updates.end()); InnerBodyStmts.push_back(Body); CompoundStmt *InnerBody = - CompoundStmt::Create(Context, InnerBodyStmts, FPOptionsOverride(), + CompoundStmt::Create(getASTContext(), InnerBodyStmts, FPOptionsOverride(), Body->getBeginLoc(), Body->getEndLoc()); ForStmt *InnerFor = new (Context) ForStmt(Context, InnerInit.get(), InnerCond.get(), nullptr, @@ -15518,12 +15566,13 @@ StmtResult 
Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, LoopHintAttr *UnrollHintAttr = LoopHintAttr::CreateImplicit(Context, LoopHintAttr::UnrollCount, LoopHintAttr::Numeric, MakeFactorExpr()); - AttributedStmt *InnerUnrolled = - AttributedStmt::Create(Context, StartLoc, {UnrollHintAttr}, InnerFor); + AttributedStmt *InnerUnrolled = AttributedStmt::Create( + getASTContext(), StartLoc, {UnrollHintAttr}, InnerFor); // Outer For init-statement: auto .unrolled.iv = 0 - AddInitializerToDecl( - OuterIVDecl, ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(), + SemaRef.AddInitializerToDecl( + OuterIVDecl, + SemaRef.ActOnIntegerConstant(LoopHelper.Init->getExprLoc(), 0).get(), /*DirectInit=*/false); StmtResult OuterInit = new (Context) DeclStmt(DeclGroupRef(OuterIVDecl), OrigVarLocBegin, OrigVarLocEnd); @@ -15532,15 +15581,15 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, // Outer For cond-expression: .unrolled.iv < NumIterations ExprResult OuterConde = - BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, MakeOuterRef(), - MakeNumIterations()); + SemaRef.BuildBinOp(CurScope, LoopHelper.Cond->getExprLoc(), BO_LT, + MakeOuterRef(), MakeNumIterations()); if (!OuterConde.isUsable()) return StmtError(); // Outer For incr-statement: .unrolled.iv += Factor ExprResult OuterIncr = - BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign, - MakeOuterRef(), MakeFactorExpr()); + SemaRef.BuildBinOp(CurScope, LoopHelper.Inc->getExprLoc(), BO_AddAssign, + MakeOuterRef(), MakeFactorExpr()); if (!OuterIncr.isUsable()) return StmtError(); @@ -15555,10 +15604,11 @@ StmtResult Sema::ActOnOpenMPUnrollDirective(ArrayRef Clauses, buildPreInits(Context, PreInits)); } -OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, + Expr *Expr, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { OMPClause *Res = nullptr; switch (Kind) { case OMPC_final: @@ -16646,19 +16696,17 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( return CaptureRegion; } -OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier, - Expr *Condition, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation NameModifierLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPIfClause( + OpenMPDirectiveKind NameModifier, Expr *Condition, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation NameModifierLoc, + SourceLocation ColonLoc, SourceLocation EndLoc) { Expr *ValExpr = Condition; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind CaptureRegion = OMPD_unknown; if (!Condition->isValueDependent() && !Condition->isTypeDependent() && !Condition->isInstantiationDependent() && !Condition->containsUnexpandedParameterPack()) { - ExprResult Val = CheckBooleanCondition(StartLoc, Condition); + ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition); if (Val.isInvalid()) return nullptr; @@ -16666,57 +16714,60 @@ OMPClause *Sema::ActOnOpenMPIfClause(OpenMPDirectiveKind NameModifier, OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); CaptureRegion = getOpenMPCaptureRegionForClause( - DKind, OMPC_if, LangOpts.OpenMP, NameModifier); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + DKind, OMPC_if, getLangOpts().OpenMP, NameModifier); + if 
(CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } - return new (Context) + return new (getASTContext()) OMPIfClause(NameModifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, NameModifierLoc, ColonLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPFinalClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPFinalClause(Expr *Condition, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Condition; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind CaptureRegion = OMPD_unknown; if (!Condition->isValueDependent() && !Condition->isTypeDependent() && !Condition->isInstantiationDependent() && !Condition->containsUnexpandedParameterPack()) { - ExprResult Val = CheckBooleanCondition(StartLoc, Condition); + ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition); if (Val.isInvalid()) return nullptr; - ValExpr = MakeFullExpr(Val.get()).get(); + ValExpr = SemaRef.MakeFullExpr(Val.get()).get(); OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_final, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_final, + getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } - return new (Context) OMPFinalClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPFinalClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -ExprResult Sema::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc, - Expr *Op) { +ExprResult +SemaOpenMP::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc, + Expr *Op) { if (!Op) return ExprError(); - class IntConvertDiagnoser : public ICEConvertDiagnoser { + class IntConvertDiagnoser : public Sema::ICEConvertDiagnoser { public: IntConvertDiagnoser() : ICEConvertDiagnoser(/*AllowScopedEnumerations*/ false, false, true) {} @@ -16752,7 +16803,7 @@ ExprResult Sema::PerformOpenMPImplicitIntegerConversion(SourceLocation Loc, llvm_unreachable("conversion functions are permitted"); } } ConvertDiagnoser; - return PerformContextualImplicitConversion(Loc, Op, ConvertDiagnoser); + return SemaRef.PerformContextualImplicitConversion(Loc, Op, ConvertDiagnoser); } static bool @@ -16765,7 +16816,7 @@ isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind, !ValExpr->isInstantiationDependent()) { SourceLocation Loc = ValExpr->getExprLoc(); ExprResult Value = - SemaRef.PerformOpenMPImplicitIntegerConversion(Loc, ValExpr); + SemaRef.OpenMP().PerformOpenMPImplicitIntegerConversion(Loc, ValExpr); if (Value.isInvalid()) return 
false; @@ -16797,37 +16848,37 @@ isNonNegativeIntegerValue(Expr *&ValExpr, Sema &SemaRef, OpenMPClauseKind CKind, return true; } -OMPClause *Sema::ActOnOpenMPNumThreadsClause(Expr *NumThreads, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNumThreadsClause(Expr *NumThreads, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = NumThreads; Stmt *HelperValStmt = nullptr; // OpenMP [2.5, Restrictions] // The num_threads expression must evaluate to a positive integer value. - if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_threads, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_num_threads, /*StrictlyPositive=*/true)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_num_threads, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( + DKind, OMPC_num_threads, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPNumThreadsClause( + return new (getASTContext()) OMPNumThreadsClause( ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E, - OpenMPClauseKind CKind, - bool StrictlyPositive, - bool SuppressExprDiags) { +ExprResult SemaOpenMP::VerifyPositiveIntegerConstantInClause( + Expr *E, OpenMPClauseKind CKind, bool StrictlyPositive, + bool SuppressExprDiags) { if (!E) return ExprError(); if (E->isValueDependent() || E->isTypeDependent() || @@ -16841,14 +16892,16 @@ ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E, // expression. struct SuppressedDiagnoser : public Sema::VerifyICEDiagnoser { SuppressedDiagnoser() : VerifyICEDiagnoser(/*Suppress=*/true) {} - Sema::SemaDiagnosticBuilder diagnoseNotICE(Sema &S, - SourceLocation Loc) override { + SemaBase::SemaDiagnosticBuilder + diagnoseNotICE(Sema &S, SourceLocation Loc) override { llvm_unreachable("Diagnostic suppressed"); } } Diagnoser; - ICE = VerifyIntegerConstantExpression(E, &Result, Diagnoser, AllowFold); + ICE = SemaRef.VerifyIntegerConstantExpression(E, &Result, Diagnoser, + Sema::AllowFold); } else { - ICE = VerifyIntegerConstantExpression(E, &Result, /*FIXME*/ AllowFold); + ICE = SemaRef.VerifyIntegerConstantExpression(E, &Result, + /*FIXME*/ Sema::AllowFold); } if (ICE.isInvalid()) return ExprError(); @@ -16872,29 +16925,31 @@ ExprResult Sema::VerifyPositiveIntegerConstantInClause(Expr *E, return ICE; } -OMPClause *Sema::ActOnOpenMPSafelenClause(Expr *Len, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSafelenClause(Expr *Len, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { // OpenMP [2.8.1, simd construct, Description] // The parameter of the safelen clause must be a constant // positive integer expression. 
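// For example (illustrative only): `safelen(8)` and `safelen(4 + 4)` are
// accepted, while `safelen(0)`, `safelen(-1)`, and `safelen(n)` for a
// runtime value `n` are all rejected by the verification below.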
ExprResult Safelen = VerifyPositiveIntegerConstantInClause(Len, OMPC_safelen); if (Safelen.isInvalid()) return nullptr; - return new (Context) + return new (getASTContext()) OMPSafelenClause(Safelen.get(), StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPSimdlenClause(Expr *Len, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSimdlenClause(Expr *Len, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { // OpenMP [2.8.1, simd construct, Description] // The parameter of the simdlen clause must be a constant // positive integer expression. ExprResult Simdlen = VerifyPositiveIntegerConstantInClause(Len, OMPC_simdlen); if (Simdlen.isInvalid()) return nullptr; - return new (Context) + return new (getASTContext()) OMPSimdlenClause(Simdlen.get(), StartLoc, LParenLoc, EndLoc); } @@ -16954,31 +17009,32 @@ static bool findOMPAllocatorHandleT(Sema &S, SourceLocation Loc, return true; } -OMPClause *Sema::ActOnOpenMPAllocatorClause(Expr *A, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPAllocatorClause(Expr *A, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { // OpenMP [2.11.3, allocate Directive, Description] // allocator is an expression of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(*this, A->getExprLoc(), DSAStack)) + if (!findOMPAllocatorHandleT(SemaRef, A->getExprLoc(), DSAStack)) return nullptr; - ExprResult Allocator = DefaultLvalueConversion(A); + ExprResult Allocator = SemaRef.DefaultLvalueConversion(A); if (Allocator.isInvalid()) return nullptr; - Allocator = PerformImplicitConversion(Allocator.get(), - DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, - /*AllowExplicit=*/true); + Allocator = SemaRef.PerformImplicitConversion( + Allocator.get(), DSAStack->getOMPAllocatorHandleT(), + Sema::AA_Initializing, + /*AllowExplicit=*/true); if (Allocator.isInvalid()) return nullptr; - return new (Context) + return new (getASTContext()) OMPAllocatorClause(Allocator.get(), StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPCollapseClause(Expr *NumForLoops, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPCollapseClause(Expr *NumForLoops, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { // OpenMP [2.7.1, loop construct, Description] // OpenMP [2.8.1, simd construct, Description] // OpenMP [2.9.6, distribute construct, Description] @@ -16988,14 +17044,14 @@ OMPClause *Sema::ActOnOpenMPCollapseClause(Expr *NumForLoops, VerifyPositiveIntegerConstantInClause(NumForLoops, OMPC_collapse); if (NumForLoopsResult.isInvalid()) return nullptr; - return new (Context) + return new (getASTContext()) OMPCollapseClause(NumForLoopsResult.get(), StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPOrderedClause(SourceLocation StartLoc, - SourceLocation EndLoc, - SourceLocation LParenLoc, - Expr *NumForLoops) { +OMPClause *SemaOpenMP::ActOnOpenMPOrderedClause(SourceLocation StartLoc, + SourceLocation EndLoc, + SourceLocation LParenLoc, + Expr *NumForLoops) { // OpenMP [2.7.1, loop construct, Description] // OpenMP [2.8.1, simd construct, Description] // OpenMP [2.9.6, distribute construct, Description] @@ -17010,14 +17066,15 @@ OMPClause *Sema::ActOnOpenMPOrderedClause(SourceLocation StartLoc, } else { NumForLoops = nullptr; } - auto *Clause = OMPOrderedClause::Create( - 
Context, NumForLoops, NumForLoops ? DSAStack->getAssociatedLoops() : 0, - StartLoc, LParenLoc, EndLoc); + auto *Clause = + OMPOrderedClause::Create(getASTContext(), NumForLoops, + NumForLoops ? DSAStack->getAssociatedLoops() : 0, + StartLoc, LParenLoc, EndLoc); DSAStack->setOrderedRegion(/*IsOrdered=*/true, NumForLoops, Clause); return Clause; } -OMPClause *Sema::ActOnOpenMPSimpleClause( +OMPClause *SemaOpenMP::ActOnOpenMPSimpleClause( OpenMPClauseKind Kind, unsigned Argument, SourceLocation ArgumentLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { OMPClause *Res = nullptr; @@ -17159,11 +17216,11 @@ getListOfPossibleValues(OpenMPClauseKind K, unsigned First, unsigned Last, return std::string(Out.str()); } -OMPClause *Sema::ActOnOpenMPDefaultClause(DefaultKind Kind, - SourceLocation KindKwLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDefaultClause(DefaultKind Kind, + SourceLocation KindKwLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMP_DEFAULT_unknown) { Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_default, /*First=*/0, @@ -17189,39 +17246,39 @@ OMPClause *Sema::ActOnOpenMPDefaultClause(DefaultKind Kind, llvm_unreachable("DSA unexpected in OpenMP default clause"); } - return new (Context) + return new (getASTContext()) OMPDefaultClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPProcBindClause(ProcBindKind Kind, - SourceLocation KindKwLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPProcBindClause(ProcBindKind Kind, + SourceLocation KindKwLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMP_PROC_BIND_unknown) { Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_proc_bind, /*First=*/unsigned(OMP_PROC_BIND_master), /*Last=*/ - unsigned(LangOpts.OpenMP > 50 + unsigned(getLangOpts().OpenMP > 50 ? 
OMP_PROC_BIND_primary : OMP_PROC_BIND_spread) + 1) << getOpenMPClauseName(OMPC_proc_bind); return nullptr; } - if (Kind == OMP_PROC_BIND_primary && LangOpts.OpenMP < 51) + if (Kind == OMP_PROC_BIND_primary && getLangOpts().OpenMP < 51) Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_proc_bind, /*First=*/unsigned(OMP_PROC_BIND_master), /*Last=*/ unsigned(OMP_PROC_BIND_spread) + 1) << getOpenMPClauseName(OMPC_proc_bind); - return new (Context) + return new (getASTContext()) OMPProcBindClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPAtomicDefaultMemOrderClause( +OMPClause *SemaOpenMP::ActOnOpenMPAtomicDefaultMemOrderClause( OpenMPAtomicDefaultMemOrderClauseKind Kind, SourceLocation KindKwLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { if (Kind == OMPC_ATOMIC_DEFAULT_MEM_ORDER_unknown) { @@ -17232,15 +17289,15 @@ OMPClause *Sema::ActOnOpenMPAtomicDefaultMemOrderClause( << getOpenMPClauseName(OMPC_atomic_default_mem_order); return nullptr; } - return new (Context) OMPAtomicDefaultMemOrderClause(Kind, KindKwLoc, StartLoc, - LParenLoc, EndLoc); + return new (getASTContext()) OMPAtomicDefaultMemOrderClause( + Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPAtClause(OpenMPAtClauseKind Kind, - SourceLocation KindKwLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPAtClause(OpenMPAtClauseKind Kind, + SourceLocation KindKwLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMPC_AT_unknown) { Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_at, /*First=*/0, @@ -17248,15 +17305,15 @@ OMPClause *Sema::ActOnOpenMPAtClause(OpenMPAtClauseKind Kind, << getOpenMPClauseName(OMPC_at); return nullptr; } - return new (Context) + return new (getASTContext()) OMPAtClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind, - SourceLocation KindKwLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind, + SourceLocation KindKwLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMPC_SEVERITY_unknown) { Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_severity, /*First=*/0, @@ -17264,28 +17321,30 @@ OMPClause *Sema::ActOnOpenMPSeverityClause(OpenMPSeverityClauseKind Kind, << getOpenMPClauseName(OMPC_severity); return nullptr; } - return new (Context) + return new (getASTContext()) OMPSeverityClause(Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPMessageClause(Expr *ME, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPMessageClause(Expr *ME, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { assert(ME && "NULL expr in Message clause"); if (!isa<StringLiteral>(ME)) { Diag(ME->getBeginLoc(), diag::warn_clause_expected_string) << getOpenMPClauseName(OMPC_message); return nullptr; } - return new (Context) OMPMessageClause(ME, StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) + OMPMessageClause(ME, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPOrderClause( +OMPClause *SemaOpenMP::ActOnOpenMPOrderClause( OpenMPOrderClauseModifier Modifier,
OpenMPOrderClauseKind Kind, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, SourceLocation KindLoc, SourceLocation EndLoc) { if (Kind != OMPC_ORDER_concurrent || - (LangOpts.OpenMP < 51 && MLoc.isValid())) { + (getLangOpts().OpenMP < 51 && MLoc.isValid())) { // Kind should be concurrent, // Modifiers introduced in OpenMP 5.1 static_assert(OMPC_ORDER_unknown > 0, @@ -17298,7 +17357,7 @@ OMPClause *Sema::ActOnOpenMPOrderClause( << getOpenMPClauseName(OMPC_order); return nullptr; } - if (LangOpts.OpenMP >= 51) { + if (getLangOpts().OpenMP >= 51) { if (Modifier == OMPC_ORDER_MODIFIER_unknown && MLoc.isValid()) { Diag(MLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_order, @@ -17315,21 +17374,21 @@ OMPClause *Sema::ActOnOpenMPOrderClause( } } } - return new (Context) OMPOrderClause(Kind, KindLoc, StartLoc, LParenLoc, - EndLoc, Modifier, MLoc); + return new (getASTContext()) OMPOrderClause( + Kind, KindLoc, StartLoc, LParenLoc, EndLoc, Modifier, MLoc); } -OMPClause *Sema::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, - SourceLocation KindKwLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, + SourceLocation KindKwLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMPC_DEPEND_unknown || Kind == OMPC_DEPEND_source || Kind == OMPC_DEPEND_sink || Kind == OMPC_DEPEND_depobj) { SmallVector<unsigned> Except = { OMPC_DEPEND_source, OMPC_DEPEND_sink, OMPC_DEPEND_depobj, OMPC_DEPEND_outallmemory, OMPC_DEPEND_inoutallmemory}; - if (LangOpts.OpenMP < 51) + if (getLangOpts().OpenMP < 51) Except.push_back(OMPC_DEPEND_inoutset); Diag(KindKwLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_depend, /*First=*/0, @@ -17337,14 +17396,14 @@ OMPClause *Sema::ActOnOpenMPUpdateClause(OpenMPDependClauseKind Kind, << getOpenMPClauseName(OMPC_update); return nullptr; } - return OMPUpdateClause::Create(Context, StartLoc, LParenLoc, KindKwLoc, Kind, - EndLoc); + return OMPUpdateClause::Create(getASTContext(), StartLoc, LParenLoc, + KindKwLoc, Kind, EndLoc); } -OMPClause *Sema::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { for (Expr *SizeExpr : SizeExprs) { ExprResult NumForLoopsResult = VerifyPositiveIntegerConstantInClause( SizeExpr, OMPC_sizes, /*StrictlyPositive=*/true); @@ -17353,19 +17412,19 @@ OMPClause *Sema::ActOnOpenMPSizesClause(ArrayRef<Expr *> SizeExprs, } DSAStack->setAssociatedLoops(SizeExprs.size()); - return OMPSizesClause::Create(Context, StartLoc, LParenLoc, EndLoc, + return OMPSizesClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, SizeExprs); } -OMPClause *Sema::ActOnOpenMPFullClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return OMPFullClause::Create(Context, StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPFullClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return OMPFullClause::Create(getASTContext(), StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPPartialClause(Expr *FactorExpr, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPPartialClause(Expr *FactorExpr, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if
(FactorExpr) { // If an argument is specified, it must be a constant (or an unevaluated // template expression). @@ -17376,22 +17435,22 @@ OMPClause *Sema::ActOnOpenMPPartialClause(Expr *FactorExpr, FactorExpr = FactorResult.get(); } - return OMPPartialClause::Create(Context, StartLoc, LParenLoc, EndLoc, + return OMPPartialClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, FactorExpr); } -OMPClause *Sema::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPAlignClause(Expr *A, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { ExprResult AlignVal; AlignVal = VerifyPositiveIntegerConstantInClause(A, OMPC_align); if (AlignVal.isInvalid()) return nullptr; - return OMPAlignClause::Create(Context, AlignVal.get(), StartLoc, LParenLoc, - EndLoc); + return OMPAlignClause::Create(getASTContext(), AlignVal.get(), StartLoc, + LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( +OMPClause *SemaOpenMP::ActOnOpenMPSingleExprWithArgClause( OpenMPClauseKind Kind, ArrayRef<unsigned> Argument, Expr *Expr, SourceLocation StartLoc, SourceLocation LParenLoc, ArrayRef<SourceLocation> ArgumentLoc, SourceLocation DelimLoc, @@ -17559,13 +17618,13 @@ static bool checkScheduleModifiers(Sema &S, OpenMPScheduleClauseModifier M1, return false; } -OMPClause *Sema::ActOnOpenMPScheduleClause( +OMPClause *SemaOpenMP::ActOnOpenMPScheduleClause( OpenMPScheduleClauseModifier M1, OpenMPScheduleClauseModifier M2, OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { - if (checkScheduleModifiers(*this, M1, M2, M1Loc, M2Loc) || - checkScheduleModifiers(*this, M2, M1, M2Loc, M1Loc)) + if (checkScheduleModifiers(SemaRef, M1, M2, M1Loc, M2Loc) || + checkScheduleModifiers(SemaRef, M2, M1, M2Loc, M1Loc)) return nullptr; // OpenMP, 2.7.1, Loop Construct, Restrictions // Either the monotonic modifier or the nonmonotonic modifier can be specified @@ -17599,7 +17658,7 @@ OMPClause *Sema::ActOnOpenMPScheduleClause( // The nonmonotonic modifier can only be specified with schedule(dynamic) or // schedule(guided). // OpenMP 5.0 does not have this restriction. - if (LangOpts.OpenMP < 50 && + if (getLangOpts().OpenMP < 50 && (M1 == OMPC_SCHEDULE_MODIFIER_nonmonotonic || M2 == OMPC_SCHEDULE_MODIFIER_nonmonotonic) && Kind != OMPC_SCHEDULE_dynamic && Kind != OMPC_SCHEDULE_guided) { @@ -17625,7 +17684,7 @@ OMPClause *Sema::ActOnOpenMPScheduleClause( // chunk_size must be a loop invariant integer expression with a positive // value.
if (std::optional<llvm::APSInt> Result = - ValExpr->getIntegerConstantExpr(Context)) { + ValExpr->getIntegerConstantExpr(getASTContext())) { if (Result->isSigned() && !Result->isStrictlyPositive()) { Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause) << "schedule" << 1 << ChunkSize->getSourceRange(); @@ -17633,24 +17692,24 @@ } } else if (getOpenMPCaptureRegionForClause( DSAStack->getCurrentDirective(), OMPC_schedule, - LangOpts.OpenMP) != OMPD_unknown && - !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getLangOpts().OpenMP) != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector<const Expr *, DeclRefExpr *> Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } } - return new (Context) + return new (getASTContext()) OMPScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind, ValExpr, HelperValStmt, M1, M1Loc, M2, M2Loc); } -OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, - SourceLocation StartLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPClause(OpenMPClauseKind Kind, + SourceLocation StartLoc, + SourceLocation EndLoc) { OMPClause *Res = nullptr; switch (Kind) { case OMPC_ordered: @@ -17804,134 +17863,138 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, return Res; } -OMPClause *Sema::ActOnOpenMPNowaitClause(SourceLocation StartLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNowaitClause(SourceLocation StartLoc, + SourceLocation EndLoc) { DSAStack->setNowaitRegion(); - return new (Context) OMPNowaitClause(StartLoc, EndLoc); + return new (getASTContext()) OMPNowaitClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPUntiedClause(SourceLocation StartLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPUntiedClause(SourceLocation StartLoc, + SourceLocation EndLoc) { DSAStack->setUntiedRegion(); - return new (Context) OMPUntiedClause(StartLoc, EndLoc); + return new (getASTContext()) OMPUntiedClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPMergeableClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPMergeableClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPMergeableClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPMergeableClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPReadClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPReadClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPReadClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPReadClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPWriteClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPWriteClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPWriteClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPWriteClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPUpdateClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return OMPUpdateClause::Create(Context, StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPUpdateClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return OMPUpdateClause::Create(getASTContext(), StartLoc, EndLoc); } -OMPClause
*Sema::ActOnOpenMPCaptureClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPCaptureClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPCaptureClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPCaptureClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPCompareClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPCompareClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPCompareClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPCompareClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPFailClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPFailClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPFailClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPFailClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPFailClause( - OpenMPClauseKind Parameter, SourceLocation KindLoc, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPFailClause(OpenMPClauseKind Parameter, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (!checkFailClauseParameter(Parameter)) { Diag(KindLoc, diag::err_omp_atomic_fail_wrong_or_no_clauses); return nullptr; } - return new (Context) + return new (getASTContext()) OMPFailClause(Parameter, KindLoc, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPSeqCstClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPSeqCstClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPSeqCstClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPSeqCstClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPAcqRelClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPAcqRelClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPAcqRelClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPAcqRelClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPAcquireClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPAcquireClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPAcquireClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPAcquireClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPReleaseClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPReleaseClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPReleaseClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPReleaseClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPRelaxedClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPRelaxedClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPRelaxedClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPRelaxedClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPWeakClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPWeakClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPWeakClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPWeakClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPThreadsClause(SourceLocation StartLoc, - 
SourceLocation EndLoc) { - return new (Context) OMPThreadsClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPThreadsClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPThreadsClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPSIMDClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPSIMDClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPSIMDClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPSIMDClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPNogroupClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPNogroupClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPNogroupClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPNogroupClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPUnifiedAddressClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPUnifiedAddressClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPUnifiedAddressClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPUnifiedSharedMemoryClause(StartLoc, EndLoc); +OMPClause * +SemaOpenMP::ActOnOpenMPUnifiedSharedMemoryClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPUnifiedSharedMemoryClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPReverseOffloadClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPReverseOffloadClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPReverseOffloadClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPDynamicAllocatorsClause(StartLoc, EndLoc); +OMPClause * +SemaOpenMP::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPDynamicAllocatorsClause(StartLoc, EndLoc); } -StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses, - SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult +SemaOpenMP::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { // OpenMP 5.1 [2.15.1, interop Construct, Restrictions] // At least one action-clause must appear on a directive.
@@ -17981,13 +18044,13 @@ StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses, if (ClauseKind == OMPC_init) { auto *E = cast<OMPInitClause>(C)->getInteropVar(); - DeclResult = getPrivateItem(*this, E, ELoc, ERange); + DeclResult = getPrivateItem(SemaRef, E, ELoc, ERange); } else if (ClauseKind == OMPC_use) { auto *E = cast<OMPUseClause>(C)->getInteropVar(); - DeclResult = getPrivateItem(*this, E, ELoc, ERange); + DeclResult = getPrivateItem(SemaRef, E, ELoc, ERange); } else if (ClauseKind == OMPC_destroy) { auto *E = cast<OMPDestroyClause>(C)->getInteropVar(); - DeclResult = getPrivateItem(*this, E, ELoc, ERange); + DeclResult = getPrivateItem(SemaRef, E, ELoc, ERange); } if (DeclResult.first) { @@ -17999,7 +18062,8 @@ StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef<OMPClause *> Clauses, } } - return OMPInteropDirective::Create(Context, StartLoc, EndLoc, Clauses); + return OMPInteropDirective::Create(getASTContext(), StartLoc, EndLoc, + Clauses); } static bool isValidInteropVariable(Sema &SemaRef, Expr *InteropVarExpr, @@ -18059,12 +18123,11 @@ static bool isValidInteropVariable(Sema &SemaRef, Expr *InteropVarExpr, return true; } -OMPClause * -Sema::ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation VarLoc, SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPInitClause( + Expr *InteropVar, OMPInteropInfo &InteropInfo, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - if (!isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_init)) + if (!isValidInteropVariable(SemaRef, InteropVar, VarLoc, OMPC_init)) return nullptr; // Check prefer_type values. These foreign-runtime-id values are either @@ -18073,7 +18136,7 @@ Sema::ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, if (E->isValueDependent() || E->isTypeDependent() || E->isInstantiationDependent() || E->containsUnexpandedParameterPack()) continue; - if (E->isIntegerConstantExpr(Context)) + if (E->isIntegerConstantExpr(getASTContext())) continue; if (isa<StringLiteral>(E)) continue; @@ -18081,28 +18144,29 @@ Sema::ActOnOpenMPInitClause(Expr *InteropVar, OMPInteropInfo &InteropInfo, return nullptr; } - return OMPInitClause::Create(Context, InteropVar, InteropInfo, StartLoc, - LParenLoc, VarLoc, EndLoc); + return OMPInitClause::Create(getASTContext(), InteropVar, InteropInfo, + StartLoc, LParenLoc, VarLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation VarLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPUseClause(Expr *InteropVar, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, + SourceLocation EndLoc) { - if (!isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_use)) + if (!isValidInteropVariable(SemaRef, InteropVar, VarLoc, OMPC_use)) return nullptr; - return new (Context) + return new (getASTContext()) OMPUseClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDestroyClause(Expr *InteropVar, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation VarLoc, - SourceLocation EndLoc) { - if (!InteropVar && LangOpts.OpenMP >= 52 && +OMPClause *SemaOpenMP::ActOnOpenMPDestroyClause(Expr *InteropVar, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, + SourceLocation EndLoc) { + if (!InteropVar && getLangOpts().OpenMP >= 52 && DSAStack->getCurrentDirective() == OMPD_depobj) { Diag(StartLoc,
diag::err_omp_expected_clause_argument) << getOpenMPClauseName(OMPC_destroy) @@ -18110,100 +18174,103 @@ OMPClause *Sema::ActOnOpenMPDestroyClause(Expr *InteropVar, return nullptr; } if (InteropVar && - !isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_destroy)) + !isValidInteropVariable(SemaRef, InteropVar, VarLoc, OMPC_destroy)) return nullptr; - return new (Context) + return new (getASTContext()) OMPDestroyClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPNovariantsClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNovariantsClause(Expr *Condition, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Condition; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind CaptureRegion = OMPD_unknown; if (!Condition->isValueDependent() && !Condition->isTypeDependent() && !Condition->isInstantiationDependent() && !Condition->containsUnexpandedParameterPack()) { - ExprResult Val = CheckBooleanCondition(StartLoc, Condition); + ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition); if (Val.isInvalid()) return nullptr; - ValExpr = MakeFullExpr(Val.get()).get(); + ValExpr = SemaRef.MakeFullExpr(Val.get()).get(); OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_novariants, - LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector<const Expr *, DeclRefExpr *> Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } - return new (Context) OMPNovariantsClause( + return new (getASTContext()) OMPNovariantsClause( ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPNocontextClause(Expr *Condition, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNocontextClause(Expr *Condition, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Condition; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind CaptureRegion = OMPD_unknown; if (!Condition->isValueDependent() && !Condition->isTypeDependent() && !Condition->isInstantiationDependent() && !Condition->containsUnexpandedParameterPack()) { - ExprResult Val = CheckBooleanCondition(StartLoc, Condition); + ExprResult Val = SemaRef.CheckBooleanCondition(StartLoc, Condition); if (Val.isInvalid()) return nullptr; - ValExpr = MakeFullExpr(Val.get()).get(); + ValExpr = SemaRef.MakeFullExpr(Val.get()).get(); OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_nocontext, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + CaptureRegion = getOpenMPCaptureRegionForClause(DKind, OMPC_nocontext, + getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector<const Expr *, DeclRefExpr *> Captures; - ValExpr =
tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } - return new (Context) OMPNocontextClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPNocontextClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPFilterClause(Expr *ThreadID, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPFilterClause(Expr *ThreadID, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = ThreadID; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_filter, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getOpenMPCaptureRegionForClause(DKind, OMPC_filter, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector<const Expr *, DeclRefExpr *> Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPFilterClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPFilterClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, - ArrayRef<Expr *> VarList, - const OMPVarListLocTy &Locs, - OpenMPVarListDataTy &Data) { +OMPClause *SemaOpenMP::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, + ArrayRef<Expr *> VarList, + const OMPVarListLocTy &Locs, + OpenMPVarListDataTy &Data) { SourceLocation StartLoc = Locs.StartLoc; SourceLocation LParenLoc = Locs.LParenLoc; SourceLocation EndLoc = Locs.EndLoc; @@ -18395,29 +18462,30 @@ OMPClause *Sema::ActOnOpenMPVarListClause(OpenMPClauseKind Kind, return Res; } -ExprResult Sema::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK, - ExprObjectKind OK, SourceLocation Loc) { - ExprResult Res = BuildDeclRefExpr( +ExprResult SemaOpenMP::getOpenMPCapturedExpr(VarDecl *Capture, ExprValueKind VK, + ExprObjectKind OK, + SourceLocation Loc) { + ExprResult Res = SemaRef.BuildDeclRefExpr( Capture, Capture->getType().getNonReferenceType(), VK_LValue, Loc); if (!Res.isUsable()) return ExprError(); if (OK == OK_Ordinary && !getLangOpts().CPlusPlus) { - Res = CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get()); + Res = SemaRef.CreateBuiltinUnaryOp(Loc, UO_Deref, Res.get()); if (!Res.isUsable()) return ExprError(); } if (VK != VK_LValue && Res.get()->isGLValue()) { - Res = DefaultLvalueConversion(Res.get()); + Res = SemaRef.DefaultLvalueConversion(Res.get()); if (!Res.isUsable()) return ExprError(); } return Res; } -OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector<Expr *, 8> Vars; SmallVector<Expr *, 8> PrivateCopies; bool IsImplicitClause = @@
-18427,7 +18495,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -18443,7 +18511,8 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, // OpenMP [2.9.3.3, Restrictions, C/C++, p.3] // A variable that appears in a private clause must not have an incomplete // type or a reference type. - if (RequireCompleteType(ELoc, Type, diag::err_omp_private_incomplete_type)) + if (SemaRef.RequireCompleteType(ELoc, Type, + diag::err_omp_private_incomplete_type)) continue; Type = Type.getNonReferenceType(); @@ -18455,7 +18524,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, // OpenMP 3.1 [2.9.3.3, private clause, Restrictions] // A variable that appears in a private clause must not have a // const-qualified type unless it is of class type with a mutable member. - if (rejectConstNotMutableType(*this, D, Type, OMPC_private, ELoc)) + if (rejectConstNotMutableType(SemaRef, D, Type, OMPC_private, ELoc)) continue; // OpenMP [2.9.1.1, Data-sharing Attribute Rules for Variables Referenced @@ -18469,7 +18538,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, if (DVar.CKind != OMPC_unknown && DVar.CKind != OMPC_private) { Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_private); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -18480,7 +18549,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, Diag(ELoc, diag::err_omp_variably_modified_type_not_supported) << getOpenMPClauseName(OMPC_private) << Type << getOpenMPDirectiveName(CurrDir); - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -18496,7 +18565,8 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, // A list item cannot appear in both a map clause and a data-sharing // attribute clause on the same construct unless the construct is a // combined construct. - if ((LangOpts.OpenMP <= 45 && isOpenMPTargetExecutionDirective(CurrDir)) || + if ((getLangOpts().OpenMP <= 45 && + isOpenMPTargetExecutionDirective(CurrDir)) || CurrDir == OMPD_target) { OpenMPClauseKind ConflictKind; if (DSAStack->checkMappableExprComponentListsForDecl( @@ -18510,7 +18580,7 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, << getOpenMPClauseName(OMPC_private) << getOpenMPClauseName(ConflictKind) << getOpenMPDirectiveName(CurrDir); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -18526,28 +18596,28 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, // proper diagnostics. Type = Type.getUnqualifiedType(); VarDecl *VDPrivate = - buildVarDecl(*this, ELoc, Type, D->getName(), + buildVarDecl(SemaRef, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr, VD ?
cast<DeclRefExpr>(SimpleRefExpr) : nullptr); - ActOnUninitializedDecl(VDPrivate); + SemaRef.ActOnUninitializedDecl(VDPrivate); if (VDPrivate->isInvalidDecl()) continue; DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc); + SemaRef, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc); DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { auto *FD = dyn_cast<FieldDecl>(D); VarDecl *VD = FD ? DSAStack->getImplicitFDCapExprDecl(FD) : nullptr; if (VD) - Ref = buildDeclRefExpr(*this, VD, VD->getType().getNonReferenceType(), + Ref = buildDeclRefExpr(SemaRef, VD, VD->getType().getNonReferenceType(), RefExpr->getExprLoc()); else - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); } if (!IsImplicitClause) DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_private, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? RefExpr->IgnoreParens() : Ref); PrivateCopies.push_back(VDPrivateRefExpr); @@ -18556,14 +18626,14 @@ OMPClause *Sema::ActOnOpenMPPrivateClause(ArrayRef<Expr *> VarList, if (Vars.empty()) return nullptr; - return OMPPrivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars, - PrivateCopies); + return OMPPrivateClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Vars, PrivateCopies); } -OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector<Expr *, 8> Vars; SmallVector<Expr *, 8> PrivateCopies; SmallVector<Expr *, 8> Inits; @@ -18577,7 +18647,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -18595,8 +18665,8 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, // OpenMP [2.9.3.3, Restrictions, C/C++, p.3] // A variable that appears in a private clause must not have an incomplete // type or a reference type. - if (RequireCompleteType(ELoc, Type, - diag::err_omp_firstprivate_incomplete_type)) + if (SemaRef.RequireCompleteType(ELoc, Type, + diag::err_omp_firstprivate_incomplete_type)) continue; Type = Type.getNonReferenceType(); @@ -18604,7 +18674,8 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, // A variable of class type (or array thereof) that appears in a private // clause requires an accessible, unambiguous copy constructor for the // class type. - QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType(); + QualType ElemType = + getASTContext().getBaseElementType(Type).getNonReferenceType(); // If an implicit firstprivate variable found it was checked already.
DSAStackTy::DSAVarData TopDVar; @@ -18613,7 +18684,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, DSAStack->getTopDSA(D, /*FromParent=*/false); TopDVar = DVar; OpenMPDirectiveKind CurrDir = DSAStack->getCurrentDirective(); - bool IsConstant = ElemType.isConstant(Context); + bool IsConstant = ElemType.isConstant(getASTContext()); // OpenMP [2.4.13, Data-sharing Attribute Clauses] // A list item that specifies a given variable may not appear in more // than one clause on the same directive, except that a variable may be @@ -18628,7 +18699,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_firstprivate); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -18648,7 +18719,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_firstprivate); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -18679,7 +18750,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, Diag(ELoc, diag::err_omp_required_access) << getOpenMPClauseName(OMPC_firstprivate) << getOpenMPClauseName(OMPC_shared); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -18712,7 +18783,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, isOpenMPTeamsDirective(DVar.DKind))) { Diag(ELoc, diag::err_omp_parallel_reduction_in_task_firstprivate) << getOpenMPDirectiveName(DVar.DKind); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -18725,7 +18796,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, // A list item cannot appear in both a map clause and a data-sharing // attribute clause on the same construct unless the construct is a // combined construct. - if ((LangOpts.OpenMP <= 45 && + if ((getLangOpts().OpenMP <= 45 && isOpenMPTargetExecutionDirective(CurrDir)) || CurrDir == OMPD_target) { OpenMPClauseKind ConflictKind; @@ -18741,7 +18812,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, << getOpenMPClauseName(OMPC_firstprivate) << getOpenMPClauseName(ConflictKind) << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -18753,7 +18824,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, Diag(ELoc, diag::err_omp_variably_modified_type_not_supported) << getOpenMPClauseName(OMPC_firstprivate) << Type << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -18763,7 +18834,7 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, Type = Type.getUnqualifiedType(); VarDecl *VDPrivate = - buildVarDecl(*this, ELoc, Type, D->getName(), + buildVarDecl(SemaRef, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr, VD ?
cast<DeclRefExpr>(SimpleRefExpr) : nullptr); // Generate helper private variable and initialize it with the value of the @@ -18776,32 +18847,32 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, // original array element in CodeGen. if (Type->isArrayType()) { VarDecl *VDInit = - buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, D->getName()); - VDInitRefExpr = buildDeclRefExpr(*this, VDInit, ElemType, ELoc); - Expr *Init = DefaultLvalueConversion(VDInitRefExpr).get(); + buildVarDecl(SemaRef, RefExpr->getExprLoc(), ElemType, D->getName()); + VDInitRefExpr = buildDeclRefExpr(SemaRef, VDInit, ElemType, ELoc); + Expr *Init = SemaRef.DefaultLvalueConversion(VDInitRefExpr).get(); ElemType = ElemType.getUnqualifiedType(); - VarDecl *VDInitTemp = buildVarDecl(*this, RefExpr->getExprLoc(), ElemType, - ".firstprivate.temp"); + VarDecl *VDInitTemp = buildVarDecl(SemaRef, RefExpr->getExprLoc(), + ElemType, ".firstprivate.temp"); InitializedEntity Entity = InitializedEntity::InitializeVariable(VDInitTemp); InitializationKind Kind = InitializationKind::CreateCopy(ELoc, ELoc); - InitializationSequence InitSeq(*this, Entity, Kind, Init); - ExprResult Result = InitSeq.Perform(*this, Entity, Kind, Init); + InitializationSequence InitSeq(SemaRef, Entity, Kind, Init); + ExprResult Result = InitSeq.Perform(SemaRef, Entity, Kind, Init); if (Result.isInvalid()) VDPrivate->setInvalidDecl(); else VDPrivate->setInit(Result.getAs<Expr>()); // Remove temp variable declaration. - Context.Deallocate(VDInitTemp); + getASTContext().Deallocate(VDInitTemp); } else { - VarDecl *VDInit = buildVarDecl(*this, RefExpr->getExprLoc(), Type, + VarDecl *VDInit = buildVarDecl(SemaRef, RefExpr->getExprLoc(), Type, ".firstprivate.temp"); - VDInitRefExpr = buildDeclRefExpr(*this, VDInit, RefExpr->getType(), + VDInitRefExpr = buildDeclRefExpr(SemaRef, VDInit, RefExpr->getType(), RefExpr->getExprLoc()); - AddInitializerToDecl(VDPrivate, - DefaultLvalueConversion(VDInitRefExpr).get(), - /*DirectInit=*/false); + SemaRef.AddInitializerToDecl( + VDPrivate, SemaRef.DefaultLvalueConversion(VDInitRefExpr).get(), + /*DirectInit=*/false); } if (VDPrivate->isInvalidDecl()) { if (IsImplicitClause) { @@ -18810,29 +18881,30 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, } continue; } - CurContext->addDecl(VDPrivate); + SemaRef.CurContext->addDecl(VDPrivate); DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), + SemaRef, VDPrivate, RefExpr->getType().getUnqualifiedType(), RefExpr->getExprLoc()); DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { if (TopDVar.CKind == OMPC_lastprivate) { Ref = TopDVar.PrivateCopy; } else { auto *FD = dyn_cast<FieldDecl>(D); VarDecl *VD = FD ? DSAStack->getImplicitFDCapExprDecl(FD) : nullptr; if (VD) - Ref = buildDeclRefExpr(*this, VD, VD->getType().getNonReferenceType(), - RefExpr->getExprLoc()); + Ref = + buildDeclRefExpr(SemaRef, VD, VD->getType().getNonReferenceType(), + RefExpr->getExprLoc()); else - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); if (VD || !isOpenMPCapturedDecl(D)) ExprCaptures.push_back(Ref->getDecl()); } } if (!IsImplicitClause) DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_firstprivate, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ?
RefExpr->IgnoreParens() : Ref); PrivateCopies.push_back(VDPrivateRefExpr); @@ -18842,12 +18914,12 @@ OMPClause *Sema::ActOnOpenMPFirstprivateClause(ArrayRef<Expr *> VarList, if (Vars.empty()) return nullptr; - return OMPFirstprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars, PrivateCopies, Inits, - buildPreInits(Context, ExprCaptures)); + return OMPFirstprivateClause::Create( + getASTContext(), StartLoc, LParenLoc, EndLoc, Vars, PrivateCopies, Inits, + buildPreInits(getASTContext(), ExprCaptures)); } -OMPClause *Sema::ActOnOpenMPLastprivateClause( +OMPClause *SemaOpenMP::ActOnOpenMPLastprivateClause( ArrayRef<Expr *> VarList, OpenMPLastprivateModifier LPKind, SourceLocation LPKindLoc, SourceLocation ColonLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { @@ -18871,7 +18943,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -18889,8 +18961,8 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // OpenMP [2.14.3.5, Restrictions, C/C++, p.2] // A variable that appears in a lastprivate clause must not have an // incomplete type or a reference type. - if (RequireCompleteType(ELoc, Type, - diag::err_omp_lastprivate_incomplete_type)) + if (SemaRef.RequireCompleteType(ELoc, Type, + diag::err_omp_lastprivate_incomplete_type)) continue; Type = Type.getNonReferenceType(); @@ -18902,7 +18974,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // OpenMP 3.1 [2.9.3.5, lastprivate clause, Restrictions] // A variable that appears in a lastprivate clause must not have a // const-qualified type unless it is of class type with a mutable member. - if (rejectConstNotMutableType(*this, D, Type, OMPC_lastprivate, ELoc)) + if (rejectConstNotMutableType(SemaRef, D, Type, OMPC_lastprivate, ELoc)) continue; // OpenMP 5.0 [2.19.4.5 lastprivate Clause, Restrictions] @@ -18910,7 +18982,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // modifier must be a scalar variable. if (LPKind == OMPC_LASTPRIVATE_conditional && !Type->isScalarType()) { Diag(ELoc, diag::err_omp_lastprivate_conditional_non_scalar); - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -18935,7 +19007,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_lastprivate); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -18954,7 +19026,7 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( Diag(ELoc, diag::err_omp_required_access) << getOpenMPClauseName(OMPC_lastprivate) << getOpenMPClauseName(OMPC_shared); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -18967,53 +19039,53 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( // A variable of class type (or array thereof) that appears in a // lastprivate clause requires an accessible, unambiguous copy assignment // operator for the class type.
- Type = Context.getBaseElementType(Type).getNonReferenceType(); - VarDecl *SrcVD = buildVarDecl(*this, ERange.getBegin(), + Type = getASTContext().getBaseElementType(Type).getNonReferenceType(); + VarDecl *SrcVD = buildVarDecl(SemaRef, ERange.getBegin(), Type.getUnqualifiedType(), ".lastprivate.src", D->hasAttrs() ? &D->getAttrs() : nullptr); DeclRefExpr *PseudoSrcExpr = - buildDeclRefExpr(*this, SrcVD, Type.getUnqualifiedType(), ELoc); + buildDeclRefExpr(SemaRef, SrcVD, Type.getUnqualifiedType(), ELoc); VarDecl *DstVD = - buildVarDecl(*this, ERange.getBegin(), Type, ".lastprivate.dst", + buildVarDecl(SemaRef, ERange.getBegin(), Type, ".lastprivate.dst", D->hasAttrs() ? &D->getAttrs() : nullptr); - DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc); + DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(SemaRef, DstVD, Type, ELoc); // For arrays generate assignment operation for single element and replace // it by the original array element in CodeGen. - ExprResult AssignmentOp = BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign, - PseudoDstExpr, PseudoSrcExpr); + ExprResult AssignmentOp = SemaRef.BuildBinOp(/*S=*/nullptr, ELoc, BO_Assign, + PseudoDstExpr, PseudoSrcExpr); if (AssignmentOp.isInvalid()) continue; - AssignmentOp = - ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false); + AssignmentOp = SemaRef.ActOnFinishFullExpr(AssignmentOp.get(), ELoc, + /*DiscardedValue*/ false); if (AssignmentOp.isInvalid()) continue; DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { if (TopDVar.CKind == OMPC_firstprivate) { Ref = TopDVar.PrivateCopy; } else { - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); if (!isOpenMPCapturedDecl(D)) ExprCaptures.push_back(Ref->getDecl()); } if ((TopDVar.CKind == OMPC_firstprivate && !TopDVar.PrivateCopy) || (!isOpenMPCapturedDecl(D) && Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>())) { - ExprResult RefRes = DefaultLvalueConversion(Ref); + ExprResult RefRes = SemaRef.DefaultLvalueConversion(Ref); if (!RefRes.isUsable()) continue; ExprResult PostUpdateRes = - BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, SimpleRefExpr, - RefRes.get()); + SemaRef.BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, + SimpleRefExpr, RefRes.get()); if (!PostUpdateRes.isUsable()) continue; ExprPostUpdates.push_back( - IgnoredValueConversions(PostUpdateRes.get()).get()); + SemaRef.IgnoredValueConversions(PostUpdateRes.get()).get()); } } DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_lastprivate, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ?
RefExpr->IgnoreParens() : Ref); SrcExprs.push_back(PseudoSrcExpr); @@ -19024,24 +19096,24 @@ OMPClause *Sema::ActOnOpenMPLastprivateClause( if (Vars.empty()) return nullptr; - return OMPLastprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars, SrcExprs, DstExprs, AssignmentOps, - LPKind, LPKindLoc, ColonLoc, - buildPreInits(Context, ExprCaptures), - buildPostUpdate(*this, ExprPostUpdates)); + return OMPLastprivateClause::Create( + getASTContext(), StartLoc, LParenLoc, EndLoc, Vars, SrcExprs, DstExprs, + AssignmentOps, LPKind, LPKindLoc, ColonLoc, + buildPreInits(getASTContext(), ExprCaptures), + buildPostUpdate(SemaRef, ExprPostUpdates)); } -OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector<Expr *, 8> Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP lastprivate clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -19063,15 +19135,16 @@ OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList, DVar.RefExpr) { Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_shared); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } DeclRefExpr *Ref = nullptr; - if (!VD && isOpenMPCapturedDecl(D) && !CurContext->isDependentContext()) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + if (!VD && isOpenMPCapturedDecl(D) && + !SemaRef.CurContext->isDependentContext()) + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_shared, Ref); - Vars.push_back((VD || !Ref || CurContext->isDependentContext()) + Vars.push_back((VD || !Ref || SemaRef.CurContext->isDependentContext()) ?
RefExpr->IgnoreParens() : Ref); } @@ -19079,7 +19152,8 @@ OMPClause *Sema::ActOnOpenMPSharedClause(ArrayRef<Expr *> VarList, if (Vars.empty()) return nullptr; - return OMPSharedClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); + return OMPSharedClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Vars); } namespace { @@ -20200,7 +20274,7 @@ static bool actOnOMPReductionKindClause( } else { VarsExpr = Ref = buildCapture(S, D, SimpleRefExpr, /*WithInit=*/false); } - if (!S.isOpenMPCapturedDecl(D)) { + if (!S.OpenMP().isOpenMPCapturedDecl(D)) { RD.ExprCaptures.emplace_back(Ref->getDecl()); if (Ref->getDecl()->hasAttr<OMPCaptureNoInitAttr>()) { ExprResult RefRes = S.DefaultLvalueConversion(Ref); @@ -20250,7 +20324,7 @@ static bool actOnOMPReductionKindClause( return RD.Vars.empty(); } -OMPClause *Sema::ActOnOpenMPReductionClause( +OMPClause *SemaOpenMP::ActOnOpenMPReductionClause( ArrayRef<Expr *> VarList, OpenMPReductionClauseModifier Modifier, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, @@ -20279,77 +20353,80 @@ OMPClause *Sema::ActOnOpenMPReductionClause( } ReductionData RD(VarList.size(), Modifier); - if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_reduction, VarList, + if (actOnOMPReductionKindClause(SemaRef, DSAStack, OMPC_reduction, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions, RD)) return nullptr; return OMPReductionClause::Create( - Context, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, Modifier, - RD.Vars, ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + getASTContext(), StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, + Modifier, RD.Vars, + ReductionIdScopeSpec.getWithLocInContext(getASTContext()), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.InscanCopyOps, RD.InscanCopyArrayTemps, RD.InscanCopyArrayElems, - buildPreInits(Context, RD.ExprCaptures), - buildPostUpdate(*this, RD.ExprPostUpdates)); + buildPreInits(getASTContext(), RD.ExprCaptures), + buildPostUpdate(SemaRef, RD.ExprPostUpdates)); } -OMPClause *Sema::ActOnOpenMPTaskReductionClause( +OMPClause *SemaOpenMP::ActOnOpenMPTaskReductionClause( ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef<Expr *> UnresolvedReductions) { ReductionData RD(VarList.size()); - if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_task_reduction, VarList, - StartLoc, LParenLoc, ColonLoc, EndLoc, - ReductionIdScopeSpec, ReductionId, + if (actOnOMPReductionKindClause(SemaRef, DSAStack, OMPC_task_reduction, + VarList, StartLoc, LParenLoc, ColonLoc, + EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions, RD)) return nullptr; return OMPTaskReductionClause::Create( - Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, - ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + getASTContext(), StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, + ReductionIdScopeSpec.getWithLocInContext(getASTContext()), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, - buildPreInits(Context, RD.ExprCaptures), - buildPostUpdate(*this, RD.ExprPostUpdates)); + buildPreInits(getASTContext(), RD.ExprCaptures), + buildPostUpdate(SemaRef, RD.ExprPostUpdates)); } -OMPClause *Sema::ActOnOpenMPInReductionClause( +OMPClause *SemaOpenMP::ActOnOpenMPInReductionClause( ArrayRef<Expr *> VarList, SourceLocation StartLoc, SourceLocation LParenLoc,
SourceLocation ColonLoc, SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { ReductionData RD(VarList.size()); - if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_in_reduction, VarList, + if (actOnOMPReductionKindClause(SemaRef, DSAStack, OMPC_in_reduction, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions, RD)) return nullptr; return OMPInReductionClause::Create( - Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, - ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + getASTContext(), StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, + ReductionIdScopeSpec.getWithLocInContext(getASTContext()), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, RD.TaskgroupDescriptors, - buildPreInits(Context, RD.ExprCaptures), - buildPostUpdate(*this, RD.ExprPostUpdates)); + buildPreInits(getASTContext(), RD.ExprCaptures), + buildPostUpdate(SemaRef, RD.ExprPostUpdates)); } -bool Sema::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind, - SourceLocation LinLoc) { - if ((!LangOpts.CPlusPlus && LinKind != OMPC_LINEAR_val) || +bool SemaOpenMP::CheckOpenMPLinearModifier(OpenMPLinearClauseKind LinKind, + SourceLocation LinLoc) { + if ((!getLangOpts().CPlusPlus && LinKind != OMPC_LINEAR_val) || LinKind == OMPC_LINEAR_unknown || LinKind == OMPC_LINEAR_step) { - Diag(LinLoc, diag::err_omp_wrong_linear_modifier) << LangOpts.CPlusPlus; + Diag(LinLoc, diag::err_omp_wrong_linear_modifier) + << getLangOpts().CPlusPlus; return true; } return false; } -bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, - OpenMPLinearClauseKind LinKind, QualType Type, - bool IsDeclareSimd) { +bool SemaOpenMP::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, + OpenMPLinearClauseKind LinKind, + QualType Type, bool IsDeclareSimd) { const auto *VD = dyn_cast_or_null(D); // A variable must not have an incomplete type or a reference type. - if (RequireCompleteType(ELoc, Type, diag::err_omp_linear_incomplete_type)) + if (SemaRef.RequireCompleteType(ELoc, Type, + diag::err_omp_linear_incomplete_type)) return true; if ((LinKind == OMPC_LINEAR_uval || LinKind == OMPC_LINEAR_ref) && !Type->isReferenceType()) { @@ -20365,17 +20442,17 @@ bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, // not apply to the firstprivate clause, nor to the linear clause on // declarative directives (like declare simd). if (!IsDeclareSimd && - rejectConstNotMutableType(*this, D, Type, OMPC_linear, ELoc)) + rejectConstNotMutableType(SemaRef, D, Type, OMPC_linear, ELoc)) return true; // A list item must be of integral or pointer type. Type = Type.getUnqualifiedType().getCanonicalType(); const auto *Ty = Type.getTypePtrOrNull(); if (!Ty || (LinKind != OMPC_LINEAR_ref && !Ty->isDependentType() && - !Ty->isIntegralType(Context) && !Ty->isPointerType())) { + !Ty->isIntegralType(getASTContext()) && !Ty->isPointerType())) { Diag(ELoc, diag::err_omp_linear_expected_int_or_ptr) << Type; if (D) { - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? 
diag::note_previous_decl : diag::note_defined_here) @@ -20386,7 +20463,7 @@ bool Sema::CheckOpenMPLinearDecl(const ValueDecl *D, SourceLocation ELoc, return false; } -OMPClause *Sema::ActOnOpenMPLinearClause( +OMPClause *SemaOpenMP::ActOnOpenMPLinearClause( ArrayRef VarList, Expr *Step, SourceLocation StartLoc, SourceLocation LParenLoc, OpenMPLinearClauseKind LinKind, SourceLocation LinLoc, SourceLocation ColonLoc, @@ -20409,7 +20486,7 @@ OMPClause *Sema::ActOnOpenMPLinearClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -20431,7 +20508,7 @@ OMPClause *Sema::ActOnOpenMPLinearClause( if (DVar.RefExpr) { Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_linear); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -20441,29 +20518,29 @@ OMPClause *Sema::ActOnOpenMPLinearClause( // Build private copy of original var. VarDecl *Private = - buildVarDecl(*this, ELoc, Type, D->getName(), + buildVarDecl(SemaRef, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr, VD ? cast(SimpleRefExpr) : nullptr); - DeclRefExpr *PrivateRef = buildDeclRefExpr(*this, Private, Type, ELoc); + DeclRefExpr *PrivateRef = buildDeclRefExpr(SemaRef, Private, Type, ELoc); // Build var to save initial value. - VarDecl *Init = buildVarDecl(*this, ELoc, Type, ".linear.start"); + VarDecl *Init = buildVarDecl(SemaRef, ELoc, Type, ".linear.start"); Expr *InitExpr; DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) { - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); + if (!VD && !SemaRef.CurContext->isDependentContext()) { + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); if (!isOpenMPCapturedDecl(D)) { ExprCaptures.push_back(Ref->getDecl()); if (Ref->getDecl()->hasAttr()) { - ExprResult RefRes = DefaultLvalueConversion(Ref); + ExprResult RefRes = SemaRef.DefaultLvalueConversion(Ref); if (!RefRes.isUsable()) continue; ExprResult PostUpdateRes = - BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, - SimpleRefExpr, RefRes.get()); + SemaRef.BuildBinOp(DSAStack->getCurScope(), ELoc, BO_Assign, + SimpleRefExpr, RefRes.get()); if (!PostUpdateRes.isUsable()) continue; ExprPostUpdates.push_back( - IgnoredValueConversions(PostUpdateRes.get()).get()); + SemaRef.IgnoredValueConversions(PostUpdateRes.get()).get()); } } } @@ -20471,12 +20548,13 @@ OMPClause *Sema::ActOnOpenMPLinearClause( InitExpr = VD ? VD->getInit() : SimpleRefExpr; else InitExpr = VD ? SimpleRefExpr : Ref; - AddInitializerToDecl(Init, DefaultLvalueConversion(InitExpr).get(), - /*DirectInit=*/false); - DeclRefExpr *InitRef = buildDeclRefExpr(*this, Init, Type, ELoc); + SemaRef.AddInitializerToDecl( + Init, SemaRef.DefaultLvalueConversion(InitExpr).get(), + /*DirectInit=*/false); + DeclRefExpr *InitRef = buildDeclRefExpr(SemaRef, Init, Type, ELoc); DSAStack->addDSA(D, RefExpr->IgnoreParens(), OMPC_linear, Ref); - Vars.push_back((VD || CurContext->isDependentContext()) + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? RefExpr->IgnoreParens() : Ref); Privates.push_back(PrivateRef); @@ -20499,17 +20577,18 @@ OMPClause *Sema::ActOnOpenMPLinearClause( // Build var to save the step value. 
VarDecl *SaveVar = - buildVarDecl(*this, StepLoc, StepExpr->getType(), ".linear.step"); + buildVarDecl(SemaRef, StepLoc, StepExpr->getType(), ".linear.step"); ExprResult SaveRef = - buildDeclRefExpr(*this, SaveVar, StepExpr->getType(), StepLoc); - ExprResult CalcStep = - BuildBinOp(CurScope, StepLoc, BO_Assign, SaveRef.get(), StepExpr); - CalcStep = ActOnFinishFullExpr(CalcStep.get(), /*DiscardedValue*/ false); + buildDeclRefExpr(SemaRef, SaveVar, StepExpr->getType(), StepLoc); + ExprResult CalcStep = SemaRef.BuildBinOp( + SemaRef.getCurScope(), StepLoc, BO_Assign, SaveRef.get(), StepExpr); + CalcStep = + SemaRef.ActOnFinishFullExpr(CalcStep.get(), /*DiscardedValue*/ false); // Warn about zero linear step (it would probably be better specified as // making corresponding variables 'const'). if (std::optional Result = - StepExpr->getIntegerConstantExpr(Context)) { + StepExpr->getIntegerConstantExpr(getASTContext())) { if (!Result->isNegative() && !Result->isStrictlyPositive()) Diag(StepLoc, diag::warn_omp_linear_step_zero) << Vars[0] << (Vars.size() > 1); @@ -20520,11 +20599,11 @@ OMPClause *Sema::ActOnOpenMPLinearClause( } } - return OMPLinearClause::Create(Context, StartLoc, LParenLoc, LinKind, LinLoc, - ColonLoc, StepModifierLoc, EndLoc, Vars, - Privates, Inits, StepExpr, CalcStepExpr, - buildPreInits(Context, ExprCaptures), - buildPostUpdate(*this, ExprPostUpdates)); + return OMPLinearClause::Create(getASTContext(), StartLoc, LParenLoc, LinKind, + LinLoc, ColonLoc, StepModifierLoc, EndLoc, + Vars, Privates, Inits, StepExpr, CalcStepExpr, + buildPreInits(getASTContext(), ExprCaptures), + buildPostUpdate(SemaRef, ExprPostUpdates)); } static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, @@ -20630,7 +20709,7 @@ static bool FinishOpenMPLinearClause(OMPLinearClause &Clause, DeclRefExpr *IV, return HasErrors; } -OMPClause *Sema::ActOnOpenMPAlignedClause( +OMPClause *SemaOpenMP::ActOnOpenMPAlignedClause( ArrayRef VarList, Expr *Alignment, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { SmallVector Vars; @@ -20639,7 +20718,7 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -20659,7 +20738,7 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( if (!Ty || (!Ty->isArrayType() && !Ty->isPointerType())) { Diag(ELoc, diag::err_omp_aligned_expected_array_or_ptr) << QType << getLangOpts().CPlusPlus << ERange; - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -20679,9 +20758,10 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( DeclRefExpr *Ref = nullptr; if (!VD && isOpenMPCapturedDecl(D)) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); - Vars.push_back(DefaultFunctionArrayConversion( - (VD || !Ref) ? RefExpr->IgnoreParens() : Ref) + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); + Vars.push_back(SemaRef + .DefaultFunctionArrayConversion( + (VD || !Ref) ?
RefExpr->IgnoreParens() : Ref) .get()); } @@ -20700,14 +20780,14 @@ OMPClause *Sema::ActOnOpenMPAlignedClause( if (Vars.empty()) return nullptr; - return OMPAlignedClause::Create(Context, StartLoc, LParenLoc, ColonLoc, - EndLoc, Vars, Alignment); + return OMPAlignedClause::Create(getASTContext(), StartLoc, LParenLoc, + ColonLoc, EndLoc, Vars, Alignment); } -OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPCopyinClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; SmallVector SrcExprs; SmallVector DstExprs; @@ -20761,26 +20841,28 @@ OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef VarList, // A variable of class type (or array thereof) that appears in a // copyin clause requires an accessible, unambiguous copy assignment // operator for the class type. - QualType ElemType = Context.getBaseElementType(Type).getNonReferenceType(); + QualType ElemType = + getASTContext().getBaseElementType(Type).getNonReferenceType(); VarDecl *SrcVD = - buildVarDecl(*this, DE->getBeginLoc(), ElemType.getUnqualifiedType(), + buildVarDecl(SemaRef, DE->getBeginLoc(), ElemType.getUnqualifiedType(), ".copyin.src", VD->hasAttrs() ? &VD->getAttrs() : nullptr); DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr( - *this, SrcVD, ElemType.getUnqualifiedType(), DE->getExprLoc()); + SemaRef, SrcVD, ElemType.getUnqualifiedType(), DE->getExprLoc()); VarDecl *DstVD = - buildVarDecl(*this, DE->getBeginLoc(), ElemType, ".copyin.dst", + buildVarDecl(SemaRef, DE->getBeginLoc(), ElemType, ".copyin.dst", VD->hasAttrs() ? &VD->getAttrs() : nullptr); DeclRefExpr *PseudoDstExpr = - buildDeclRefExpr(*this, DstVD, ElemType, DE->getExprLoc()); + buildDeclRefExpr(SemaRef, DstVD, ElemType, DE->getExprLoc()); // For arrays generate assignment operation for single element and replace // it by the original array element in CodeGen. 
ExprResult AssignmentOp = - BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign, PseudoDstExpr, - PseudoSrcExpr); + SemaRef.BuildBinOp(/*S=*/nullptr, DE->getExprLoc(), BO_Assign, + PseudoDstExpr, PseudoSrcExpr); if (AssignmentOp.isInvalid()) continue; - AssignmentOp = ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(), - /*DiscardedValue*/ false); + AssignmentOp = + SemaRef.ActOnFinishFullExpr(AssignmentOp.get(), DE->getExprLoc(), + /*DiscardedValue*/ false); if (AssignmentOp.isInvalid()) continue; @@ -20794,14 +20876,14 @@ OMPClause *Sema::ActOnOpenMPCopyinClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPCopyinClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars, - SrcExprs, DstExprs, AssignmentOps); + return OMPCopyinClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Vars, SrcExprs, DstExprs, AssignmentOps); } -OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPCopyprivateClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; SmallVector SrcExprs; SmallVector DstExprs; @@ -20811,7 +20893,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -20837,7 +20919,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, Diag(ELoc, diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_copyprivate); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -20850,7 +20932,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, Diag(ELoc, diag::err_omp_required_access) << getOpenMPClauseName(OMPC_copyprivate) << "threadprivate or private in the enclosing context"; - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } } @@ -20861,7 +20943,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, Diag(ELoc, diag::err_omp_variably_modified_type_not_supported) << getOpenMPClauseName(OMPC_copyprivate) << Type << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - bool IsDecl = !VD || VD->isThisDeclarationADefinition(Context) == + bool IsDecl = !VD || VD->isThisDeclarationADefinition(getASTContext()) == VarDecl::DeclarationOnly; Diag(D->getLocation(), IsDecl ? diag::note_previous_decl : diag::note_defined_here) @@ -20873,22 +20955,23 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, // A variable of class type (or array thereof) that appears in a // copyprivate clause requires an accessible, unambiguous copy assignment // operator for the class type. - Type = Context.getBaseElementType(Type.getNonReferenceType()) + Type = getASTContext() + .getBaseElementType(Type.getNonReferenceType()) .getUnqualifiedType(); VarDecl *SrcVD = - buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.src", + buildVarDecl(SemaRef, RefExpr->getBeginLoc(), Type, ".copyprivate.src", D->hasAttrs() ?
&D->getAttrs() : nullptr); - DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(*this, SrcVD, Type, ELoc); + DeclRefExpr *PseudoSrcExpr = buildDeclRefExpr(SemaRef, SrcVD, Type, ELoc); VarDecl *DstVD = - buildVarDecl(*this, RefExpr->getBeginLoc(), Type, ".copyprivate.dst", + buildVarDecl(SemaRef, RefExpr->getBeginLoc(), Type, ".copyprivate.dst", D->hasAttrs() ? &D->getAttrs() : nullptr); - DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(*this, DstVD, Type, ELoc); - ExprResult AssignmentOp = BuildBinOp( + DeclRefExpr *PseudoDstExpr = buildDeclRefExpr(SemaRef, DstVD, Type, ELoc); + ExprResult AssignmentOp = SemaRef.BuildBinOp( DSAStack->getCurScope(), ELoc, BO_Assign, PseudoDstExpr, PseudoSrcExpr); if (AssignmentOp.isInvalid()) continue; - AssignmentOp = - ActOnFinishFullExpr(AssignmentOp.get(), ELoc, /*DiscardedValue*/ false); + AssignmentOp = SemaRef.ActOnFinishFullExpr(AssignmentOp.get(), ELoc, + /*DiscardedValue*/ false); if (AssignmentOp.isInvalid()) continue; @@ -20897,7 +20980,7 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, assert(VD || isOpenMPCapturedDecl(D)); Vars.push_back( VD ? RefExpr->IgnoreParens() - : buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false)); + : buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false)); SrcExprs.push_back(PseudoSrcExpr); DstExprs.push_back(PseudoDstExpr); AssignmentOps.push_back(AssignmentOp.get()); @@ -20906,18 +20989,20 @@ OMPClause *Sema::ActOnOpenMPCopyprivateClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPCopyprivateClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars, SrcExprs, DstExprs, AssignmentOps); + return OMPCopyprivateClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars, SrcExprs, DstExprs, + AssignmentOps); } -OMPClause *Sema::ActOnOpenMPFlushClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPFlushClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (VarList.empty()) return nullptr; - return OMPFlushClause::Create(Context, StartLoc, LParenLoc, EndLoc, VarList); + return OMPFlushClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + VarList); } /// Tries to find the omp_depend_t type. @@ -20937,22 +21022,23 @@ static bool findOMPDependT(Sema &S, SourceLocation Loc, DSAStackTy *Stack, return true; } -OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDepobjClause(Expr *Depobj, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (!Depobj) return nullptr; - bool OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack); + bool OMPDependTFound = findOMPDependT(SemaRef, StartLoc, DSAStack); // OpenMP 5.0, 2.17.10.1 depobj Construct // depobj is an lvalue expression of type omp_depend_t.
if (!Depobj->isTypeDependent() && !Depobj->isValueDependent() && !Depobj->isInstantiationDependent() && !Depobj->containsUnexpandedParameterPack() && - (OMPDependTFound && - !Context.typesAreCompatible(DSAStack->getOMPDependT(), Depobj->getType(), - /*CompareUnqualified=*/true))) { + (OMPDependTFound && !getASTContext().typesAreCompatible( + DSAStack->getOMPDependT(), Depobj->getType(), + /*CompareUnqualified=*/true))) { Diag(Depobj->getExprLoc(), diag::err_omp_expected_omp_depend_t_lvalue) << 0 << Depobj->getType() << Depobj->getSourceRange(); } @@ -20962,7 +21048,8 @@ OMPClause *Sema::ActOnOpenMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, << 1 << Depobj->getSourceRange(); } - return OMPDepobjClause::Create(Context, StartLoc, LParenLoc, EndLoc, Depobj); + return OMPDepobjClause::Create(getASTContext(), StartLoc, LParenLoc, EndLoc, + Depobj); } namespace { @@ -21062,8 +21149,9 @@ ProcessOpenMPDoacrossClauseCommon(Sema &SemaRef, bool IsSource, continue; } if (RHS) { - ExprResult RHSRes = SemaRef.VerifyPositiveIntegerConstantInClause( - RHS, OMPC_depend, /*StrictlyPositive=*/false); + ExprResult RHSRes = + SemaRef.OpenMP().VerifyPositiveIntegerConstantInClause( + RHS, OMPC_depend, /*StrictlyPositive=*/false); if (RHSRes.isInvalid()) continue; } @@ -21094,11 +21182,10 @@ ProcessOpenMPDoacrossClauseCommon(Sema &SemaRef, bool IsSource, return {Vars, OpsOffs, TotalDepCount}; } -OMPClause * -Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, - Expr *DepModifier, ArrayRef VarList, - SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDependClause( + const OMPDependClause::DependDataTy &Data, Expr *DepModifier, + ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc) { OpenMPDependClauseKind DepKind = Data.DepKind; SourceLocation DepLoc = Data.DepLoc; if (DSAStack->getCurrentDirective() == OMPD_ordered && @@ -21116,17 +21203,18 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, DSAStack->getCurrentDirective() == OMPD_depobj) && (DepKind == OMPC_DEPEND_unknown || DepKind == OMPC_DEPEND_source || DepKind == OMPC_DEPEND_sink || - ((LangOpts.OpenMP < 50 || + ((getLangOpts().OpenMP < 50 || DSAStack->getCurrentDirective() == OMPD_depobj) && DepKind == OMPC_DEPEND_depobj))) { SmallVector Except = {OMPC_DEPEND_source, OMPC_DEPEND_sink, OMPC_DEPEND_outallmemory, OMPC_DEPEND_inoutallmemory}; - if (LangOpts.OpenMP < 50 || DSAStack->getCurrentDirective() == OMPD_depobj) + if (getLangOpts().OpenMP < 50 || + DSAStack->getCurrentDirective() == OMPD_depobj) Except.push_back(OMPC_DEPEND_depobj); - if (LangOpts.OpenMP < 51) + if (getLangOpts().OpenMP < 51) Except.push_back(OMPC_DEPEND_inoutset); - std::string Expected = (LangOpts.OpenMP >= 50 && !DepModifier) + std::string Expected = (getLangOpts().OpenMP >= 50 && !DepModifier) ? 
"depend modifier(iterator) or " : ""; Diag(DepLoc, diag::err_omp_unexpected_clause_value) @@ -21152,7 +21240,7 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, if (DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) { DoacrossDataInfoTy VarOffset = ProcessOpenMPDoacrossClauseCommon( - *this, DepKind == OMPC_DEPEND_source, VarList, DSAStack, EndLoc); + SemaRef, DepKind == OMPC_DEPEND_source, VarList, DSAStack, EndLoc); Vars = VarOffset.Vars; OpsOffs = VarOffset.OpsOffs; TotalDepCount = VarOffset.TotalDepCount; @@ -21168,9 +21256,9 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, SourceLocation ELoc = RefExpr->getExprLoc(); Expr *SimpleExpr = RefExpr->IgnoreParenCasts(); if (DepKind != OMPC_DEPEND_sink && DepKind != OMPC_DEPEND_source) { - bool OMPDependTFound = LangOpts.OpenMP >= 50; + bool OMPDependTFound = getLangOpts().OpenMP >= 50; if (OMPDependTFound) - OMPDependTFound = findOMPDependT(*this, StartLoc, DSAStack, + OMPDependTFound = findOMPDependT(SemaRef, StartLoc, DSAStack, DepKind == OMPC_DEPEND_depobj); if (DepKind == OMPC_DEPEND_depobj) { // OpenMP 5.0, 2.17.11 depend Clause, Restrictions, C/C++ @@ -21180,8 +21268,8 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, !RefExpr->isInstantiationDependent() && !RefExpr->containsUnexpandedParameterPack() && (OMPDependTFound && - !Context.hasSameUnqualifiedType(DSAStack->getOMPDependT(), - RefExpr->getType()))) { + !getASTContext().hasSameUnqualifiedType( + DSAStack->getOMPDependT(), RefExpr->getType()))) { Diag(ELoc, diag::err_omp_expected_omp_depend_t_lvalue) << 0 << RefExpr->getType() << RefExpr->getSourceRange(); continue; @@ -21212,7 +21300,7 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, const Expr *Length = OASE->getLength(); Expr::EvalResult Result; if (Length && !Length->isValueDependent() && - Length->EvaluateAsInt(Result, Context) && + Length->EvaluateAsInt(Result, getASTContext()) && Result.Val.getInt().isZero()) { Diag(ELoc, diag::err_omp_depend_zero_length_array_section_not_allowed) @@ -21232,8 +21320,9 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, (OMPDependTFound && DSAStack->getOMPDependT().getTypePtr() == ExprTy.getTypePtr()))) { Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item) - << (LangOpts.OpenMP >= 50 ? 1 : 0) - << (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange(); + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << RefExpr->getSourceRange(); continue; } @@ -21245,22 +21334,24 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, ->isPointerType() && !ASE->getBase()->getType().getNonReferenceType()->isArrayType()) { Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item) - << (LangOpts.OpenMP >= 50 ? 1 : 0) - << (LangOpts.OpenMP >= 50 ? 1 : 0) << RefExpr->getSourceRange(); + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << RefExpr->getSourceRange(); continue; } ExprResult Res; { - Sema::TentativeAnalysisScope Trap(*this); - Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, - RefExpr->IgnoreParenImpCasts()); + Sema::TentativeAnalysisScope Trap(SemaRef); + Res = SemaRef.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, + RefExpr->IgnoreParenImpCasts()); } if (!Res.isUsable() && !isa(SimpleExpr) && !isa(SimpleExpr)) { Diag(ELoc, diag::err_omp_expected_addressable_lvalue_or_array_item) - << (LangOpts.OpenMP >= 50 ? 1 : 0) - << (LangOpts.OpenMP >= 50 ? 
1 : 0) << RefExpr->getSourceRange(); + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << (getLangOpts().OpenMP >= 50 ? 1 : 0) + << RefExpr->getSourceRange(); continue; } } @@ -21275,7 +21366,7 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, return nullptr; auto *C = OMPDependClause::Create( - Context, StartLoc, LParenLoc, EndLoc, + getASTContext(), StartLoc, LParenLoc, EndLoc, {DepKind, DepLoc, Data.ColonLoc, Data.OmpAllMemoryLoc}, DepModifier, Vars, TotalDepCount.getZExtValue()); if ((DepKind == OMPC_DEPEND_sink || DepKind == OMPC_DEPEND_source) && @@ -21284,12 +21375,11 @@ Sema::ActOnOpenMPDependClause(const OMPDependClause::DependDataTy &Data, return C; } -OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, - Expr *Device, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ModifierLoc, - SourceLocation EndLoc) { - assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 50) && +OMPClause *SemaOpenMP::ActOnOpenMPDeviceClause( + OpenMPDeviceClauseModifier Modifier, Expr *Device, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation ModifierLoc, + SourceLocation EndLoc) { + assert((ModifierLoc.isInvalid() || getLangOpts().OpenMP >= 50) && "Unexpected device modifier in OpenMP < 50."); bool ErrorFound = false; @@ -21306,7 +21396,7 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, // OpenMP [2.9.1, Restrictions] // The device expression must evaluate to a non-negative integer value. - ErrorFound = !isNonNegativeIntegerValue(ValExpr, *this, OMPC_device, + ErrorFound = !isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_device, /*StrictlyPositive=*/false) || ErrorFound; if (ErrorFound) @@ -21317,7 +21407,7 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, // the reverse_offload clause must be specified. 
if (Modifier == OMPC_DEVICE_ancestor) { if (!DSAStack->hasRequiresDeclWithClause()) { - targetDiag( + SemaRef.targetDiag( StartLoc, diag::err_omp_device_ancestor_without_requires_reverse_offload); ErrorFound = true; @@ -21326,15 +21416,16 @@ OMPClause *Sema::ActOnOpenMPDeviceClause(OpenMPDeviceClauseModifier Modifier, OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_device, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getOpenMPCaptureRegionForClause(DKind, OMPC_device, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) + return new (getASTContext()) OMPDeviceClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, ModifierLoc, EndLoc); } @@ -22527,7 +22618,7 @@ static void checkMappableExpressionList( } } -OMPClause *Sema::ActOnOpenMPMapClause( +OMPClause *SemaOpenMP::ActOnOpenMPMapClause( Expr *IteratorModifier, ArrayRef MapTypeModifiers, ArrayRef MapTypeModifiersLoc, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, @@ -22562,7 +22653,7 @@ OMPClause *Sema::ActOnOpenMPMapClause( } MappableVarListInfo MVLI(VarList); - checkMappableExpressionList(*this, DSAStack, OMPC_map, MVLI, Locs.StartLoc, + checkMappableExpressionList(SemaRef, DSAStack, OMPC_map, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers, MapType, Modifiers, IsMapTypeImplicit, NoDiagnose); @@ -22570,17 +22661,17 @@ OMPClause *Sema::ActOnOpenMPMapClause( // We need to produce a map clause even if we don't have variables so that // other diagnostics related with non-existing map clauses are accurate. 
return OMPMapClause::Create( - Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, MVLI.VarComponents, MVLI.UDMapperList, IteratorModifier, Modifiers, - ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(Context), MapperId, - MapType, IsMapTypeImplicit, MapLoc); + ModifiersLoc, MapperIdScopeSpec.getWithLocInContext(getASTContext()), + MapperId, MapType, IsMapTypeImplicit, MapLoc); } -QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, - TypeResult ParsedType) { +QualType SemaOpenMP::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, + TypeResult ParsedType) { assert(ParsedType.isUsable()); - QualType ReductionType = GetTypeFromParser(ParsedType.get()); + QualType ReductionType = SemaRef.GetTypeFromParser(ParsedType.get()); if (ReductionType.isNull()) return QualType(); @@ -22608,15 +22699,17 @@ QualType Sema::ActOnOpenMPDeclareReductionType(SourceLocation TyLoc, return ReductionType; } -Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( +SemaOpenMP::DeclGroupPtrTy +SemaOpenMP::ActOnOpenMPDeclareReductionDirectiveStart( Scope *S, DeclContext *DC, DeclarationName Name, ArrayRef> ReductionTypes, AccessSpecifier AS, Decl *PrevDeclInScope) { SmallVector Decls; Decls.reserve(ReductionTypes.size()); - LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPReductionName, - forRedeclarationInCurContext()); + LookupResult Lookup(SemaRef, Name, SourceLocation(), + Sema::LookupOMPReductionName, + SemaRef.forRedeclarationInCurContext()); // [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions // A reduction-identifier may not be re-declared in the current scope for the // same type or for a type that is compatible according to the base language @@ -22627,12 +22720,12 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( if (S != nullptr) { // Find previous declaration with the same name not referenced in other // declarations. - FunctionScopeInfo *ParentFn = getEnclosingFunction(); + FunctionScopeInfo *ParentFn = SemaRef.getEnclosingFunction(); InCompoundScope = (ParentFn != nullptr) && !ParentFn->CompoundScopes.empty(); - LookupName(Lookup, S); - FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace=*/false); + SemaRef.LookupName(Lookup, S); + SemaRef.FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, + /*AllowInlineNamespace=*/false); llvm::DenseMap UsedAsPrevious; LookupResult::Filter Filter = Lookup.makeFilter(); while (Filter.hasNext()) { @@ -22675,8 +22768,8 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( Invalid = true; } PreviousRedeclTypes[TyData.first.getCanonicalType()] = TyData.second; - auto *DRD = OMPDeclareReductionDecl::Create(Context, DC, TyData.second, - Name, TyData.first, PrevDRD); + auto *DRD = OMPDeclareReductionDecl::Create( + getASTContext(), DC, TyData.second, Name, TyData.first, PrevDRD); DC->addDecl(DRD); DRD->setAccess(AS); Decls.push_back(DRD); @@ -22687,24 +22780,24 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveStart( } return DeclGroupPtrTy::make( - DeclGroupRef::Create(Context, Decls.begin(), Decls.size())); + DeclGroupRef::Create(getASTContext(), Decls.begin(), Decls.size())); } -void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { +void SemaOpenMP::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { auto *DRD = cast(D); // Enter new function scope. 
- PushFunctionScope(); - setFunctionHasBranchProtectedScope(); - getCurFunction()->setHasOMPDeclareReductionCombiner(); + SemaRef.PushFunctionScope(); + SemaRef.setFunctionHasBranchProtectedScope(); + SemaRef.getCurFunction()->setHasOMPDeclareReductionCombiner(); if (S != nullptr) - PushDeclContext(S, DRD); + SemaRef.PushDeclContext(S, DRD); else - CurContext = DRD; + SemaRef.CurContext = DRD; - PushExpressionEvaluationContext( - ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); QualType ReductionType = DRD->getType(); // Create 'T* omp_parm;T omp_in;'. All references to 'omp_in' will @@ -22714,7 +22807,7 @@ void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_in;' variable. VarDecl *OmpInParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_in"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_in"); // Create 'T* omp_parm;T omp_out;'. All references to 'omp_out' will // be replaced by '*omp_parm' during codegen. This is required because 'omp_out' // uses semantics of argument handles by value, but it should be passed by @@ -22722,28 +22815,29 @@ void Sema::ActOnOpenMPDeclareReductionCombinerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_out;' variable. VarDecl *OmpOutParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_out"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_out"); if (S != nullptr) { - PushOnScopeChains(OmpInParm, S); - PushOnScopeChains(OmpOutParm, S); + SemaRef.PushOnScopeChains(OmpInParm, S); + SemaRef.PushOnScopeChains(OmpOutParm, S); } else { DRD->addDecl(OmpInParm); DRD->addDecl(OmpOutParm); } Expr *InE = - ::buildDeclRefExpr(*this, OmpInParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpInParm, ReductionType, D->getLocation()); Expr *OutE = - ::buildDeclRefExpr(*this, OmpOutParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpOutParm, ReductionType, D->getLocation()); DRD->setCombinerData(InE, OutE); } -void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) { +void SemaOpenMP::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, + Expr *Combiner) { auto *DRD = cast(D); - DiscardCleanupsInEvaluationContext(); - PopExpressionEvaluationContext(); + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); - PopDeclContext(); - PopFunctionScopeInfo(); + SemaRef.PopDeclContext(); + SemaRef.PopFunctionScopeInfo(); if (Combiner != nullptr) DRD->setCombiner(Combiner); @@ -22751,20 +22845,21 @@ void Sema::ActOnOpenMPDeclareReductionCombinerEnd(Decl *D, Expr *Combiner) { DRD->setInvalidDecl(); } -VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) { +VarDecl *SemaOpenMP::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, + Decl *D) { auto *DRD = cast(D); // Enter new function scope. - PushFunctionScope(); - setFunctionHasBranchProtectedScope(); + SemaRef.PushFunctionScope(); + SemaRef.setFunctionHasBranchProtectedScope(); if (S != nullptr) - PushDeclContext(S, DRD); + SemaRef.PushDeclContext(S, DRD); else - CurContext = DRD; + SemaRef.CurContext = DRD; - PushExpressionEvaluationContext( - ExpressionEvaluationContext::PotentiallyEvaluated); + SemaRef.PushExpressionEvaluationContext( + Sema::ExpressionEvaluationContext::PotentiallyEvaluated); QualType ReductionType = DRD->getType(); // Create 'T* omp_parm;T omp_priv;'.
All references to 'omp_priv' will @@ -22774,7 +22869,7 @@ VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_priv;' variable. VarDecl *OmpPrivParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_priv"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_priv"); // Create 'T* omp_parm;T omp_orig;'. All references to 'omp_orig' will // be replaced by '*omp_parm' during codegen. This is required because 'omp_orig' // uses semantics of argument handles by value, but it should be passed by @@ -22782,30 +22877,30 @@ VarDecl *Sema::ActOnOpenMPDeclareReductionInitializerStart(Scope *S, Decl *D) { // pointers. // Create 'T omp_orig;' variable. VarDecl *OmpOrigParm = - buildVarDecl(*this, D->getLocation(), ReductionType, "omp_orig"); + buildVarDecl(SemaRef, D->getLocation(), ReductionType, "omp_orig"); if (S != nullptr) { - PushOnScopeChains(OmpPrivParm, S); - PushOnScopeChains(OmpOrigParm, S); + SemaRef.PushOnScopeChains(OmpPrivParm, S); + SemaRef.PushOnScopeChains(OmpOrigParm, S); } else { DRD->addDecl(OmpPrivParm); DRD->addDecl(OmpOrigParm); } Expr *OrigE = - ::buildDeclRefExpr(*this, OmpOrigParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpOrigParm, ReductionType, D->getLocation()); Expr *PrivE = - ::buildDeclRefExpr(*this, OmpPrivParm, ReductionType, D->getLocation()); + ::buildDeclRefExpr(SemaRef, OmpPrivParm, ReductionType, D->getLocation()); DRD->setInitializerData(OrigE, PrivE); return OmpPrivParm; } -void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer, - VarDecl *OmpPrivParm) { +void SemaOpenMP::ActOnOpenMPDeclareReductionInitializerEnd( + Decl *D, Expr *Initializer, VarDecl *OmpPrivParm) { auto *DRD = cast(D); - DiscardCleanupsInEvaluationContext(); - PopExpressionEvaluationContext(); + SemaRef.DiscardCleanupsInEvaluationContext(); + SemaRef.PopExpressionEvaluationContext(); - PopDeclContext(); - PopFunctionScopeInfo(); + SemaRef.PopDeclContext(); + SemaRef.PopFunctionScopeInfo(); if (Initializer != nullptr) { DRD->setInitializer(Initializer, OMPDeclareReductionInitKind::Call); @@ -22819,13 +22914,13 @@ void Sema::ActOnOpenMPDeclareReductionInitializerEnd(Decl *D, Expr *Initializer, } } -Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd( +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareReductionDirectiveEnd( Scope *S, DeclGroupPtrTy DeclReductions, bool IsValid) { for (Decl *D : DeclReductions.get()) { if (IsValid) { if (S) - PushOnScopeChains(cast(D), S, - /*AddToContext=*/false); + SemaRef.PushOnScopeChains(cast(D), S, + /*AddToContext=*/false); } else { D->setInvalidDecl(); } @@ -22833,25 +22928,26 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareReductionDirectiveEnd( return DeclReductions; } -TypeResult Sema::ActOnOpenMPDeclareMapperVarDecl(Scope *S, Declarator &D) { - TypeSourceInfo *TInfo = GetTypeForDeclarator(D); +TypeResult SemaOpenMP::ActOnOpenMPDeclareMapperVarDecl(Scope *S, + Declarator &D) { + TypeSourceInfo *TInfo = SemaRef.GetTypeForDeclarator(D); QualType T = TInfo->getType(); if (D.isInvalidType()) return true; if (getLangOpts().CPlusPlus) { // Check that there are no default arguments (C++ only).
- CheckExtraCXXDefaultArguments(D); + SemaRef.CheckExtraCXXDefaultArguments(D); } - return CreateParsedType(T, TInfo); + return SemaRef.CreateParsedType(T, TInfo); } -QualType Sema::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, - TypeResult ParsedType) { +QualType SemaOpenMP::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, + TypeResult ParsedType) { assert(ParsedType.isUsable() && "Expect usable parsed mapper type"); - QualType MapperType = GetTypeFromParser(ParsedType.get()); + QualType MapperType = SemaRef.GetTypeFromParser(ParsedType.get()); assert(!MapperType.isNull() && "Expect valid mapper type"); // [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions @@ -22863,12 +22959,13 @@ QualType Sema::ActOnOpenMPDeclareMapperType(SourceLocation TyLoc, return MapperType; } -Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( +SemaOpenMP::DeclGroupPtrTy SemaOpenMP::ActOnOpenMPDeclareMapperDirective( Scope *S, DeclContext *DC, DeclarationName Name, QualType MapperType, SourceLocation StartLoc, DeclarationName VN, AccessSpecifier AS, Expr *MapperVarRef, ArrayRef Clauses, Decl *PrevDeclInScope) { - LookupResult Lookup(*this, Name, SourceLocation(), LookupOMPMapperName, - forRedeclarationInCurContext()); + LookupResult Lookup(SemaRef, Name, SourceLocation(), + Sema::LookupOMPMapperName, + SemaRef.forRedeclarationInCurContext()); // [OpenMP 5.0], 2.19.7.3 declare mapper Directive, Restrictions // A mapper-identifier may not be redeclared in the current scope for the // same type or for a type that is compatible according to the base language @@ -22879,12 +22976,12 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( if (S != nullptr) { // Find previous declaration with the same name not referenced in other // declarations. - FunctionScopeInfo *ParentFn = getEnclosingFunction(); + FunctionScopeInfo *ParentFn = SemaRef.getEnclosingFunction(); InCompoundScope = (ParentFn != nullptr) && !ParentFn->CompoundScopes.empty(); - LookupName(Lookup, S); - FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, - /*AllowInlineNamespace=*/false); + SemaRef.LookupName(Lookup, S); + SemaRef.FilterLookupForScope(Lookup, DC, S, /*ConsiderLinkage=*/false, + /*AllowInlineNamespace=*/false); llvm::DenseMap UsedAsPrevious; LookupResult::Filter Filter = Lookup.makeFilter(); while (Filter.hasNext()) { @@ -22929,13 +23026,14 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( // mappers. 
SmallVector ClausesWithImplicit(Clauses.begin(), Clauses.end()); - if (LangOpts.OpenMP >= 50) - processImplicitMapsWithDefaultMappers(*this, DSAStack, ClausesWithImplicit); - auto *DMD = - OMPDeclareMapperDecl::Create(Context, DC, StartLoc, Name, MapperType, VN, - ClausesWithImplicit, PrevDMD); + if (getLangOpts().OpenMP >= 50) + processImplicitMapsWithDefaultMappers(SemaRef, DSAStack, + ClausesWithImplicit); + auto *DMD = OMPDeclareMapperDecl::Create(getASTContext(), DC, StartLoc, Name, + MapperType, VN, ClausesWithImplicit, + PrevDMD); if (S) - PushOnScopeChains(DMD, S); + SemaRef.PushOnScopeChains(DMD, S); else DC->addDecl(DMD); DMD->setAccess(AS); @@ -22951,105 +23049,106 @@ Sema::DeclGroupPtrTy Sema::ActOnOpenMPDeclareMapperDirective( return DeclGroupPtrTy::make(DeclGroupRef(DMD)); } -ExprResult -Sema::ActOnOpenMPDeclareMapperDirectiveVarDecl(Scope *S, QualType MapperType, - SourceLocation StartLoc, - DeclarationName VN) { +ExprResult SemaOpenMP::ActOnOpenMPDeclareMapperDirectiveVarDecl( + Scope *S, QualType MapperType, SourceLocation StartLoc, + DeclarationName VN) { TypeSourceInfo *TInfo = - Context.getTrivialTypeSourceInfo(MapperType, StartLoc); - auto *VD = VarDecl::Create(Context, Context.getTranslationUnitDecl(), - StartLoc, StartLoc, VN.getAsIdentifierInfo(), - MapperType, TInfo, SC_None); + getASTContext().getTrivialTypeSourceInfo(MapperType, StartLoc); + auto *VD = VarDecl::Create( + getASTContext(), getASTContext().getTranslationUnitDecl(), StartLoc, + StartLoc, VN.getAsIdentifierInfo(), MapperType, TInfo, SC_None); if (S) - PushOnScopeChains(VD, S, /*AddToContext=*/false); - Expr *E = buildDeclRefExpr(*this, VD, MapperType, StartLoc); + SemaRef.PushOnScopeChains(VD, S, /*AddToContext=*/false); + Expr *E = buildDeclRefExpr(SemaRef, VD, MapperType, StartLoc); DSAStack->addDeclareMapperVarRef(E); return E; } -void Sema::ActOnOpenMPIteratorVarDecl(VarDecl *VD) { +void SemaOpenMP::ActOnOpenMPIteratorVarDecl(VarDecl *VD) { if (DSAStack->getDeclareMapperVarRef()) DSAStack->addIteratorVarDecl(VD); } -bool Sema::isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const { - assert(LangOpts.OpenMP && "Expected OpenMP mode."); +bool SemaOpenMP::isOpenMPDeclareMapperVarDeclAllowed(const VarDecl *VD) const { + assert(getLangOpts().OpenMP && "Expected OpenMP mode."); const Expr *Ref = DSAStack->getDeclareMapperVarRef(); if (const auto *DRE = cast_or_null(Ref)) { if (VD->getCanonicalDecl() == DRE->getDecl()->getCanonicalDecl()) return true; - if (VD->isUsableInConstantExpressions(Context)) + if (VD->isUsableInConstantExpressions(getASTContext())) return true; - if (LangOpts.OpenMP >= 52 && DSAStack->isIteratorVarDecl(VD)) + if (getLangOpts().OpenMP >= 52 && DSAStack->isIteratorVarDecl(VD)) return true; return false; } return true; } -const ValueDecl *Sema::getOpenMPDeclareMapperVarName() const { - assert(LangOpts.OpenMP && "Expected OpenMP mode."); +const ValueDecl *SemaOpenMP::getOpenMPDeclareMapperVarName() const { + assert(getLangOpts().OpenMP && "Expected OpenMP mode."); return cast(DSAStack->getDeclareMapperVarRef())->getDecl(); } -OMPClause *Sema::ActOnOpenMPNumTeamsClause(Expr *NumTeams, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNumTeamsClause(Expr *NumTeams, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = NumTeams; Stmt *HelperValStmt = nullptr; // OpenMP [teams Construct, Restrictions] // The num_teams expression must evaluate to a positive integer value.
- if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_num_teams, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_num_teams, /*StrictlyPositive=*/true)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); - OpenMPDirectiveKind CaptureRegion = - getOpenMPCaptureRegionForClause(DKind, OMPC_num_teams, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( + DKind, OMPC_num_teams, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPNumTeamsClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPNumTeamsClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPThreadLimitClause(Expr *ThreadLimit, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = ThreadLimit; Stmt *HelperValStmt = nullptr; // OpenMP [teams Construct, Restrictions] // The thread_limit expression must evaluate to a positive integer value. - if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_thread_limit, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_thread_limit, /*StrictlyPositive=*/true)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( - DKind, OMPC_thread_limit, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + DKind, OMPC_thread_limit, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPThreadLimitClause( + return new (getASTContext()) OMPThreadLimitClause( ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPPriorityClause(Expr *Priority, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Priority; Stmt *HelperValStmt = nullptr; OpenMPDirectiveKind CaptureRegion = OMPD_unknown; @@ -23057,20 +23156,20 @@ OMPClause *Sema::ActOnOpenMPPriorityClause(Expr *Priority, // OpenMP [2.9.1, task Construct] // The priority-value is a non-negative numerical scalar expression.
if (!isNonNegativeIntegerValue( - ValExpr, *this, OMPC_priority, + ValExpr, SemaRef, OMPC_priority, /*StrictlyPositive=*/false, /*BuildCapture=*/true, DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt)) return nullptr; - return new (Context) OMPPriorityClause(ValExpr, HelperValStmt, CaptureRegion, - StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) OMPPriorityClause( + ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPGrainsizeClause( +OMPClause *SemaOpenMP::ActOnOpenMPGrainsizeClause( OpenMPGrainsizeClauseModifier Modifier, Expr *Grainsize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 51) && + assert((ModifierLoc.isInvalid() || getLangOpts().OpenMP >= 51) && "Unexpected grainsize modifier in OpenMP < 51."); if (ModifierLoc.isValid() && Modifier == OMPC_GRAINSIZE_unknown) { @@ -23088,23 +23187,23 @@ OMPClause *Sema::ActOnOpenMPGrainsizeClause( // OpenMP [2.9.2, taskloop Construct] // The parameter of the grainsize clause must be a positive integer // expression. - if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_grainsize, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_grainsize, /*StrictlyPositive=*/true, /*BuildCapture=*/true, DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt)) return nullptr; - return new (Context) + return new (getASTContext()) OMPGrainsizeClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, ModifierLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPNumTasksClause( +OMPClause *SemaOpenMP::ActOnOpenMPNumTasksClause( OpenMPNumTasksClauseModifier Modifier, Expr *NumTasks, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - assert((ModifierLoc.isInvalid() || LangOpts.OpenMP >= 51) && + assert((ModifierLoc.isInvalid() || getLangOpts().OpenMP >= 51) && "Unexpected num_tasks modifier in OpenMP < 51."); if (ModifierLoc.isValid() && Modifier == OMPC_NUMTASKS_unknown) { @@ -23123,19 +23222,20 @@ OMPClause *Sema::ActOnOpenMPNumTasksClause( // The parameter of the num_tasks clause must be a positive integer // expression. if (!isNonNegativeIntegerValue( - ValExpr, *this, OMPC_num_tasks, + ValExpr, SemaRef, OMPC_num_tasks, /*StrictlyPositive=*/true, /*BuildCapture=*/true, DSAStack->getCurrentDirective(), &CaptureRegion, &HelperValStmt)) return nullptr; - return new (Context) + return new (getASTContext()) OMPNumTasksClause(Modifier, ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, ModifierLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPHintClause(Expr *Hint, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { // OpenMP [2.13.2, critical construct, Description] // ... where hint-expression is an integer constant expression that evaluates // to a valid lock hint.
@@ -23143,7 +23243,7 @@ OMPClause *Sema::ActOnOpenMPHintClause(Expr *Hint, SourceLocation StartLoc, VerifyPositiveIntegerConstantInClause(Hint, OMPC_hint, false); if (HintExpr.isInvalid()) return nullptr; - return new (Context) + return new (getASTContext()) OMPHintClause(HintExpr.get(), StartLoc, LParenLoc, EndLoc); } @@ -23163,13 +23263,14 @@ static bool findOMPEventHandleT(Sema &S, SourceLocation Loc, return true; } -OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPDetachClause(Expr *Evt, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (!Evt->isValueDependent() && !Evt->isTypeDependent() && !Evt->isInstantiationDependent() && !Evt->containsUnexpandedParameterPack()) { - if (!findOMPEventHandleT(*this, Evt->getExprLoc(), DSAStack)) + if (!findOMPEventHandleT(SemaRef, Evt->getExprLoc(), DSAStack)) return nullptr; // OpenMP 5.0, 2.10.1 task Construct. // event-handle is a variable of the omp_event_handle_t type. @@ -23185,9 +23286,9 @@ OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, << "omp_event_handle_t" << 0 << Evt->getSourceRange(); return nullptr; } - if (!Context.hasSameUnqualifiedType(DSAStack->getOMPEventHandleT(), - VD->getType()) || - VD->getType().isConstant(Context)) { + if (!getASTContext().hasSameUnqualifiedType(DSAStack->getOMPEventHandleT(), + VD->getType()) || + VD->getType().isConstant(getASTContext())) { Diag(Evt->getExprLoc(), diag::err_omp_var_expected) << "omp_event_handle_t" << 1 << VD->getType() << Evt->getSourceRange(); @@ -23202,15 +23303,16 @@ OMPClause *Sema::ActOnOpenMPDetachClause(Expr *Evt, SourceLocation StartLoc, Diag(Evt->getExprLoc(), diag::err_omp_wrong_dsa) << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_firstprivate); - reportOriginalDsa(*this, DSAStack, VD, DVar); + reportOriginalDsa(SemaRef, DSAStack, VD, DVar); return nullptr; } } - return new (Context) OMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc); + return new (getASTContext()) + OMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDistScheduleClause( +OMPClause *SemaOpenMP::ActOnOpenMPDistScheduleClause( OpenMPDistScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { @@ -23241,7 +23343,7 @@ OMPClause *Sema::ActOnOpenMPDistScheduleClause( // chunk_size must be a loop invariant integer expression with a positive // value. 
if (std::optional Result = - ValExpr->getIntegerConstantExpr(Context)) { + ValExpr->getIntegerConstantExpr(getASTContext())) { if (Result->isSigned() && !Result->isStrictlyPositive()) { Diag(ChunkSizeLoc, diag::err_omp_negative_expression_in_clause) << "dist_schedule" << ChunkSize->getSourceRange(); @@ -23249,22 +23351,22 @@ OMPClause *Sema::ActOnOpenMPDistScheduleClause( } } else if (getOpenMPCaptureRegionForClause( DSAStack->getCurrentDirective(), OMPC_dist_schedule, - LangOpts.OpenMP) != OMPD_unknown && - !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + getLangOpts().OpenMP) != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } } } - return new (Context) + return new (getASTContext()) OMPDistScheduleClause(StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc, Kind, ValExpr, HelperValStmt); } -OMPClause *Sema::ActOnOpenMPDefaultmapClause( +OMPClause *SemaOpenMP::ActOnOpenMPDefaultmapClause( OpenMPDefaultmapClauseModifier M, OpenMPDefaultmapClauseKind Kind, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation MLoc, SourceLocation KindLoc, SourceLocation EndLoc) { @@ -23291,10 +23393,10 @@ OMPClause *Sema::ActOnOpenMPDefaultmapClause( } else { bool isDefaultmapModifier = (M != OMPC_DEFAULTMAP_MODIFIER_unknown); bool isDefaultmapKind = (Kind != OMPC_DEFAULTMAP_unknown) || - (LangOpts.OpenMP >= 50 && KindLoc.isInvalid()); + (getLangOpts().OpenMP >= 50 && KindLoc.isInvalid()); if (!isDefaultmapKind || !isDefaultmapModifier) { StringRef KindValue = "'scalar', 'aggregate', 'pointer'"; - if (LangOpts.OpenMP == 50) { + if (getLangOpts().OpenMP == 50) { StringRef ModifierValue = "'alloc', 'from', 'to', 'tofrom', " "'firstprivate', 'none', 'default'"; if (!isDefaultmapKind && isDefaultmapModifier) { @@ -23346,13 +23448,13 @@ OMPClause *Sema::ActOnOpenMPDefaultmapClause( DSAStack->setDefaultDMAAttr(M, Kind, StartLoc); } - return new (Context) + return new (getASTContext()) OMPDefaultmapClause(StartLoc, LParenLoc, MLoc, KindLoc, EndLoc, Kind, M); } -bool Sema::ActOnStartOpenMPDeclareTargetContext( +bool SemaOpenMP::ActOnStartOpenMPDeclareTargetContext( DeclareTargetContextInfo &DTCI) { - DeclContext *CurLexicalContext = getCurLexicalContext(); + DeclContext *CurLexicalContext = SemaRef.getCurLexicalContext(); if (!CurLexicalContext->isFileContext() && !CurLexicalContext->isExternCContext() && !CurLexicalContext->isExternCXXContext() && @@ -23372,20 +23474,20 @@ bool Sema::ActOnStartOpenMPDeclareTargetContext( return true; } -const Sema::DeclareTargetContextInfo -Sema::ActOnOpenMPEndDeclareTargetDirective() { +const SemaOpenMP::DeclareTargetContextInfo +SemaOpenMP::ActOnOpenMPEndDeclareTargetDirective() { assert(!DeclareTargetNesting.empty() && "check isInOpenMPDeclareTargetContext() first!"); return DeclareTargetNesting.pop_back_val(); } -void Sema::ActOnFinishedOpenMPDeclareTargetContext( +void SemaOpenMP::ActOnFinishedOpenMPDeclareTargetContext( DeclareTargetContextInfo &DTCI) { for (auto &It : DTCI.ExplicitlyMapped) ActOnOpenMPDeclareTargetName(It.first, It.second.Loc, It.second.MT, DTCI); } -void Sema::DiagnoseUnterminatedOpenMPDeclareTarget() { +void SemaOpenMP::DiagnoseUnterminatedOpenMPDeclareTarget() { if (DeclareTargetNesting.empty()) 
return; DeclareTargetContextInfo &DTCI = DeclareTargetNesting.back(); @@ -23393,23 +23495,23 @@ void Sema::DiagnoseUnterminatedOpenMPDeclareTarget() { << getOpenMPDirectiveName(DTCI.Kind); } -NamedDecl *Sema::lookupOpenMPDeclareTargetName(Scope *CurScope, - CXXScopeSpec &ScopeSpec, - const DeclarationNameInfo &Id) { - LookupResult Lookup(*this, Id, LookupOrdinaryName); - LookupParsedName(Lookup, CurScope, &ScopeSpec, true); +NamedDecl *SemaOpenMP::lookupOpenMPDeclareTargetName( + Scope *CurScope, CXXScopeSpec &ScopeSpec, const DeclarationNameInfo &Id) { + LookupResult Lookup(SemaRef, Id, Sema::LookupOrdinaryName); + SemaRef.LookupParsedName(Lookup, CurScope, &ScopeSpec, true); if (Lookup.isAmbiguous()) return nullptr; Lookup.suppressDiagnostics(); if (!Lookup.isSingleResult()) { - VarOrFuncDeclFilterCCC CCC(*this); + VarOrFuncDeclFilterCCC CCC(SemaRef); if (TypoCorrection Corrected = - CorrectTypo(Id, LookupOrdinaryName, CurScope, nullptr, CCC, - CTK_ErrorRecovery)) { - diagnoseTypo(Corrected, PDiag(diag::err_undeclared_var_use_suggest) - << Id.getName()); + SemaRef.CorrectTypo(Id, Sema::LookupOrdinaryName, CurScope, nullptr, + CCC, Sema::CTK_ErrorRecovery)) { + SemaRef.diagnoseTypo(Corrected, + SemaRef.PDiag(diag::err_undeclared_var_use_suggest) + << Id.getName()); checkDeclIsAllowedInOpenMPTarget(nullptr, Corrected.getCorrectionDecl()); return nullptr; } @@ -23427,9 +23529,9 @@ NamedDecl *Sema::lookupOpenMPDeclareTargetName(Scope *CurScope, return ND; } -void Sema::ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, - OMPDeclareTargetDeclAttr::MapTypeTy MT, - DeclareTargetContextInfo &DTCI) { +void SemaOpenMP::ActOnOpenMPDeclareTargetName( + NamedDecl *ND, SourceLocation Loc, OMPDeclareTargetDeclAttr::MapTypeTy MT, + DeclareTargetContextInfo &DTCI) { assert((isa(ND) || isa(ND) || isa(ND)) && "Expected variable, function or function template."); @@ -23445,7 +23547,7 @@ void Sema::ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, } // Diagnose marking after use as it may lead to incorrect diagnosis and // codegen. - if (LangOpts.OpenMP >= 50 && + if (getLangOpts().OpenMP >= 50 && (ND->isUsed(/*CheckUsedAttr=*/false) || ND->isReferenced())) Diag(Loc, diag::warn_omp_declare_target_after_first_use); @@ -23484,14 +23586,14 @@ void Sema::ActOnOpenMPDeclareTargetName(NamedDecl *ND, SourceLocation Loc, IsIndirect = true; } auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( - Context, MT, DTCI.DT, IndirectE, IsIndirect, Level, + getASTContext(), MT, DTCI.DT, IndirectE, IsIndirect, Level, SourceRange(Loc, Loc)); ND->addAttr(A); - if (ASTMutationListener *ML = Context.getASTMutationListener()) + if (ASTMutationListener *ML = getASTContext().getASTMutationListener()) ML->DeclarationMarkedOpenMPDeclareTarget(ND, A); checkDeclIsAllowedInOpenMPTarget(nullptr, ND, Loc); if (auto *VD = dyn_cast(ND); - LangOpts.OpenMP && VD && VD->hasAttr() && + getLangOpts().OpenMP && VD && VD->hasAttr() && VD->hasGlobalStorage()) ActOnOpenMPDeclareTargetInitializer(ND); } @@ -23535,8 +23637,8 @@ static bool checkValueDeclInTarget(SourceLocation SL, SourceRange SR, /*FullCheck=*/false); } -void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, - SourceLocation IdLoc) { +void SemaOpenMP::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, + SourceLocation IdLoc) { if (!D || D->isInvalidDecl()) return; SourceRange SR = E ? E->getSourceRange() : D->getSourceRange(); @@ -23550,7 +23652,7 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, // directive. 
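
For context, the marking performed above follows the usual `declare target` flow, and the `warn_omp_declare_target_after_first_use` path fires on code like the following minimal sketch (illustrative, not taken from this patch; OpenMP 5.2 spells the clause `enter` rather than `to`):

    int counter = 0;
    int read_counter() { return counter; }  // first use of 'counter'
    #pragma omp declare target to(counter)  // warning: marked declare target after first use
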
if (DSAStack->isThreadPrivate(VD)) { Diag(SL, diag::err_omp_threadprivate_in_target); - reportOriginalDsa(*this, DSAStack, VD, DSAStack->getTopDSA(VD, false)); + reportOriginalDsa(SemaRef, DSAStack, VD, DSAStack->getTopDSA(VD, false)); return; } } @@ -23569,7 +23671,7 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, // Problem if any with var declared with incomplete type will be reported // as normal, so no need to check it here. if ((E || !VD->getType()->isIncompleteType()) && - !checkValueDeclInTarget(SL, SR, *this, DSAStack, VD)) + !checkValueDeclInTarget(SL, SR, SemaRef, DSAStack, VD)) return; if (!E && isInOpenMPDeclareTargetContext()) { // Checking declaration inside declare target region. @@ -23589,13 +23691,13 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, IsIndirect = true; } auto *A = OMPDeclareTargetDeclAttr::CreateImplicit( - Context, + getASTContext(), getLangOpts().OpenMP >= 52 ? OMPDeclareTargetDeclAttr::MT_Enter : OMPDeclareTargetDeclAttr::MT_To, DTCI.DT, IndirectE, IsIndirect, Level, SourceRange(DTCI.Loc, DTCI.Loc)); D->addAttr(A); - if (ASTMutationListener *ML = Context.getASTMutationListener()) + if (ASTMutationListener *ML = getASTContext().getASTMutationListener()) ML->DeclarationMarkedOpenMPDeclareTarget(D, A); } return; @@ -23603,7 +23705,7 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, } if (!E) return; - checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), *this, D); + checkDeclInTargetContext(E->getExprLoc(), E->getSourceRange(), SemaRef, D); } /// This class visits every VarDecl that the initializer references and adds @@ -23649,13 +23751,13 @@ class GlobalDeclRefChecker final /// Adding OMPDeclareTargetDeclAttr to variables with static storage /// duration that are referenced in the initializer expression list of /// variables with static storage duration in declare target directive. 
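
The threadprivate check above (`err_omp_threadprivate_in_target`) rejects mixing the two directives; a minimal reproducer would look like this (assumed example, not from the patch):

    int tp;
    #pragma omp threadprivate(tp)
    #pragma omp declare target to(tp)  // error: threadprivate variables cannot appear in a target region
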
-void Sema::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { +void SemaOpenMP::ActOnOpenMPDeclareTargetInitializer(Decl *TargetDecl) { GlobalDeclRefChecker Checker; if (isa(TargetDecl)) Checker.declareTargetInitializer(TargetDecl); } -OMPClause *Sema::ActOnOpenMPToClause( +OMPClause *SemaOpenMP::ActOnOpenMPToClause( ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, @@ -23681,18 +23783,18 @@ OMPClause *Sema::ActOnOpenMPToClause( } MappableVarListInfo MVLI(VarList); - checkMappableExpressionList(*this, DSAStack, OMPC_to, MVLI, Locs.StartLoc, + checkMappableExpressionList(SemaRef, DSAStack, OMPC_to, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; return OMPToClause::Create( - Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(Context), MapperId); + MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); } -OMPClause *Sema::ActOnOpenMPFromClause( +OMPClause *SemaOpenMP::ActOnOpenMPFromClause( ArrayRef MotionModifiers, ArrayRef MotionModifiersLoc, CXXScopeSpec &MapperIdScopeSpec, DeclarationNameInfo &MapperId, @@ -23718,19 +23820,20 @@ OMPClause *Sema::ActOnOpenMPFromClause( } MappableVarListInfo MVLI(VarList); - checkMappableExpressionList(*this, DSAStack, OMPC_from, MVLI, Locs.StartLoc, + checkMappableExpressionList(SemaRef, DSAStack, OMPC_from, MVLI, Locs.StartLoc, MapperIdScopeSpec, MapperId, UnresolvedMappers); if (MVLI.ProcessedVarList.empty()) return nullptr; return OMPFromClause::Create( - Context, Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, MVLI.VarComponents, MVLI.UDMapperList, Modifiers, ModifiersLoc, - MapperIdScopeSpec.getWithLocInContext(Context), MapperId); + MapperIdScopeSpec.getWithLocInContext(getASTContext()), MapperId); } -OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); SmallVector PrivateCopies; SmallVector Inits; @@ -23740,7 +23843,7 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. MVLI.ProcessedVarList.push_back(RefExpr); @@ -23765,30 +23868,30 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, // Build the private variable and the expression that refers to it. auto VDPrivate = - buildVarDecl(*this, ELoc, Type, D->getName(), + buildVarDecl(SemaRef, ELoc, Type, D->getName(), D->hasAttrs() ? &D->getAttrs() : nullptr, VD ? cast(SimpleRefExpr) : nullptr); if (VDPrivate->isInvalidDecl()) continue; - CurContext->addDecl(VDPrivate); + SemaRef.CurContext->addDecl(VDPrivate); DeclRefExpr *VDPrivateRefExpr = buildDeclRefExpr( - *this, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc); + SemaRef, VDPrivate, RefExpr->getType().getUnqualifiedType(), ELoc); // Add temporary variable to initialize the private copy of the pointer. 
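
`ActOnOpenMPToClause` and `ActOnOpenMPFromClause` above validate the motion clauses of `target update`; both run the same `checkMappableExpressionList` machinery and differ only in direction. A minimal sketch of the two directions (illustrative only):

    int a[64];
    void refresh(int n) {
    #pragma omp target enter data map(to : a[0:n])  // create the device copy
    #pragma omp target update to(a[0:n])            // host -> device ('to' clause)
    #pragma omp target update from(a[0:n])          // device -> host ('from' clause)
    }
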
VarDecl *VDInit = - buildVarDecl(*this, RefExpr->getExprLoc(), Type, ".devptr.temp"); + buildVarDecl(SemaRef, RefExpr->getExprLoc(), Type, ".devptr.temp"); DeclRefExpr *VDInitRefExpr = buildDeclRefExpr( - *this, VDInit, RefExpr->getType(), RefExpr->getExprLoc()); - AddInitializerToDecl(VDPrivate, - DefaultLvalueConversion(VDInitRefExpr).get(), - /*DirectInit=*/false); + SemaRef, VDInit, RefExpr->getType(), RefExpr->getExprLoc()); + SemaRef.AddInitializerToDecl( + VDPrivate, SemaRef.DefaultLvalueConversion(VDInitRefExpr).get(), + /*DirectInit=*/false); // If required, build a capture to implement the privatization initialized // with the current list item value. DeclRefExpr *Ref = nullptr; if (!VD) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); MVLI.ProcessedVarList.push_back(VD ? RefExpr->IgnoreParens() : Ref); PrivateCopies.push_back(VDPrivateRefExpr); Inits.push_back(VDInitRefExpr); @@ -23810,12 +23913,13 @@ OMPClause *Sema::ActOnOpenMPUseDevicePtrClause(ArrayRef VarList, return nullptr; return OMPUseDevicePtrClause::Create( - Context, Locs, MVLI.ProcessedVarList, PrivateCopies, Inits, + getASTContext(), Locs, MVLI.ProcessedVarList, PrivateCopies, Inits, MVLI.VarBaseDeclarations, MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); for (Expr *RefExpr : VarList) { @@ -23823,7 +23927,7 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) { // It will be analyzed later. @@ -23838,7 +23942,7 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, // with the current list item value. DeclRefExpr *Ref = nullptr; if (!VD) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); MVLI.ProcessedVarList.push_back(VD ? 
RefExpr->IgnoreParens() : Ref); // We need to add a data sharing attribute for this variable to make sure it @@ -23853,7 +23957,8 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, Expr *Component = SimpleRefExpr; if (VD && (isa(RefExpr->IgnoreParenImpCasts()) || isa(RefExpr->IgnoreParenImpCasts()))) - Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); + Component = + SemaRef.DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); MVLI.VarComponents.back().emplace_back(Component, D, /*IsNonContiguous=*/false); } @@ -23861,20 +23966,21 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef VarList, if (MVLI.ProcessedVarList.empty()) return nullptr; - return OMPUseDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList, - MVLI.VarBaseDeclarations, - MVLI.VarComponents); + return OMPUseDeviceAddrClause::Create( + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP is_device_ptr clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. MVLI.ProcessedVarList.push_back(RefExpr); @@ -23900,7 +24006,7 @@ OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_is_device_ptr) << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -23944,20 +24050,21 @@ OMPClause *Sema::ActOnOpenMPIsDevicePtrClause(ArrayRef VarList, if (MVLI.ProcessedVarList.empty()) return nullptr; - return OMPIsDevicePtrClause::Create(Context, Locs, MVLI.ProcessedVarList, - MVLI.VarBaseDeclarations, - MVLI.VarComponents); + return OMPIsDevicePtrClause::Create( + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, - const OMPVarListLocTy &Locs) { +OMPClause * +SemaOpenMP::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, + const OMPVarListLocTy &Locs) { MappableVarListInfo MVLI(VarList); for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP has_device_addr clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) { // It will be analyzed later. 
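
The clause builders above differ mainly in how the list item is rebound inside the region: `use_device_ptr` privatizes the pointer and initializes it from the ".devptr.temp" helper built earlier, while `is_device_ptr` asserts the pointer already holds a device address. An illustrative sketch using the standard device-memory routines:

    #include <omp.h>

    void kernels(int *p, int n) {
      int d = omp_get_default_device();
      int *dp = (int *)omp_target_alloc(n * sizeof(int), d);
    #pragma omp target is_device_ptr(dp)  // 'dp' is already a device address
      { dp[0] = 42; }
    #pragma omp target data map(tofrom : p[0:n]) use_device_ptr(p)
      {
        // here 'p' is a private copy rebound to the corresponding device address
      }
      omp_target_free(dp, d);
    }
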
@@ -23975,7 +24082,7 @@ OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, << getOpenMPClauseName(DVar.CKind) << getOpenMPClauseName(OMPC_has_device_addr) << getOpenMPDirectiveName(DSAStack->getCurrentDirective()); - reportOriginalDsa(*this, DSAStack, D, DVar); + reportOriginalDsa(SemaRef, DSAStack, D, DVar); continue; } @@ -24000,16 +24107,17 @@ OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, auto *VD = dyn_cast(D); if (VD && (isa(RefExpr->IgnoreParenImpCasts()) || isa(RefExpr->IgnoreParenImpCasts()))) - Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); + Component = + SemaRef.DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get(); OMPClauseMappableExprCommon::MappableComponent MC( Component, D, /*IsNonContiguous=*/false); DSAStack->addMappableExpressionComponents( D, MC, /*WhereFoundClauseKind=*/OMPC_has_device_addr); // Record the expression we've just processed. - if (!VD && !CurContext->isDependentContext()) { + if (!VD && !SemaRef.CurContext->isDependentContext()) { DeclRefExpr *Ref = - buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/true); + buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/true); assert(Ref && "has_device_addr capture failed"); MVLI.ProcessedVarList.push_back(Ref); } else @@ -24030,27 +24138,27 @@ OMPClause *Sema::ActOnOpenMPHasDeviceAddrClause(ArrayRef VarList, if (MVLI.ProcessedVarList.empty()) return nullptr; - return OMPHasDeviceAddrClause::Create(Context, Locs, MVLI.ProcessedVarList, - MVLI.VarBaseDeclarations, - MVLI.VarComponents); + return OMPHasDeviceAddrClause::Create( + getASTContext(), Locs, MVLI.ProcessedVarList, MVLI.VarBaseDeclarations, + MVLI.VarComponents); } -OMPClause *Sema::ActOnOpenMPAllocateClause( +OMPClause *SemaOpenMP::ActOnOpenMPAllocateClause( Expr *Allocator, ArrayRef VarList, SourceLocation StartLoc, SourceLocation ColonLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { if (Allocator) { // OpenMP [2.11.4 allocate Clause, Description] // allocator is an expression of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(*this, Allocator->getExprLoc(), DSAStack)) + if (!findOMPAllocatorHandleT(SemaRef, Allocator->getExprLoc(), DSAStack)) return nullptr; - ExprResult AllocatorRes = DefaultLvalueConversion(Allocator); + ExprResult AllocatorRes = SemaRef.DefaultLvalueConversion(Allocator); if (AllocatorRes.isInvalid()) return nullptr; - AllocatorRes = PerformImplicitConversion(AllocatorRes.get(), - DSAStack->getOMPAllocatorHandleT(), - Sema::AA_Initializing, - /*AllowExplicit=*/true); + AllocatorRes = SemaRef.PerformImplicitConversion( + AllocatorRes.get(), DSAStack->getOMPAllocatorHandleT(), + Sema::AA_Initializing, + /*AllowExplicit=*/true); if (AllocatorRes.isInvalid()) return nullptr; Allocator = AllocatorRes.get(); @@ -24060,9 +24168,9 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( // target region must specify an allocator expression unless a requires // directive with the dynamic_allocators clause is present in the same // compilation unit. - if (LangOpts.OpenMPIsTargetDevice && + if (getLangOpts().OpenMPIsTargetDevice && !DSAStack->hasRequiresDeclWithClause()) - targetDiag(StartLoc, diag::err_expected_allocator_expression); + SemaRef.targetDiag(StartLoc, diag::err_expected_allocator_expression); } // Analyze and build list of variables. 
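
A sketch of an `allocate` clause this builder accepts; per the restriction handled above, a target construct would additionally require an explicit allocator expression unless a `requires dynamic_allocators` directive is visible (example assumed, not from the patch):

    #include <omp.h>

    void work() {
      int tmp = 0;
      // the list item must also appear in a data-sharing clause on the directive
    #pragma omp parallel firstprivate(tmp) allocate(omp_default_mem_alloc : tmp)
      { tmp += omp_get_thread_num(); }
    }
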
SmallVector Vars; @@ -24071,7 +24179,7 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) { // It will be analyzed later. Vars.push_back(RefExpr); @@ -24082,9 +24190,9 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( auto *VD = dyn_cast(D); DeclRefExpr *Ref = nullptr; - if (!VD && !CurContext->isDependentContext()) - Ref = buildCapture(*this, D, SimpleRefExpr, /*WithInit=*/false); - Vars.push_back((VD || CurContext->isDependentContext()) + if (!VD && !SemaRef.CurContext->isDependentContext()) + Ref = buildCapture(SemaRef, D, SimpleRefExpr, /*WithInit=*/false); + Vars.push_back((VD || SemaRef.CurContext->isDependentContext()) ? RefExpr->IgnoreParens() : Ref); } @@ -24094,21 +24202,21 @@ OMPClause *Sema::ActOnOpenMPAllocateClause( if (Allocator) DSAStack->addInnerAllocatorExpr(Allocator); - return OMPAllocateClause::Create(Context, StartLoc, LParenLoc, Allocator, - ColonLoc, EndLoc, Vars); + return OMPAllocateClause::Create(getASTContext(), StartLoc, LParenLoc, + Allocator, ColonLoc, EndLoc, Vars); } -OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPNontemporalClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange); + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange); if (Res.second) // It will be analyzed later. Vars.push_back(RefExpr); @@ -24133,32 +24241,34 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPNontemporalClause::Create(Context, StartLoc, LParenLoc, EndLoc, - Vars); + return OMPNontemporalClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars); } -StmtResult Sema::ActOnOpenMPScopeDirective(ArrayRef Clauses, - Stmt *AStmt, SourceLocation StartLoc, - SourceLocation EndLoc) { +StmtResult SemaOpenMP::ActOnOpenMPScopeDirective(ArrayRef Clauses, + Stmt *AStmt, + SourceLocation StartLoc, + SourceLocation EndLoc) { if (!AStmt) return StmtError(); - setFunctionHasBranchProtectedScope(); + SemaRef.setFunctionHasBranchProtectedScope(); - return OMPScopeDirective::Create(Context, StartLoc, EndLoc, Clauses, AStmt); + return OMPScopeDirective::Create(getASTContext(), StartLoc, EndLoc, Clauses, + AStmt); } -OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) // It will be analyzed later. 
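
`ActOnOpenMPNontemporalClause` above only checks that each list item is a plain variable and appears once; typical use marks streaming accesses on a `simd` loop (illustrative):

    void saxpy(float *x, float *y, float a, int n) {
    #pragma omp simd nontemporal(x, y)
      for (int i = 0; i < n; ++i)
        y[i] = a * x[i] + y[i];
    }
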
@@ -24185,20 +24295,21 @@ OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPInclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); + return OMPInclusiveClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars); } -OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { SmallVector Vars; for (Expr *RefExpr : VarList) { assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); SourceLocation ELoc; SourceRange ERange; Expr *SimpleRefExpr = RefExpr; - auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + auto Res = getPrivateItem(SemaRef, SimpleRefExpr, ELoc, ERange, /*AllowArraySection=*/true); if (Res.second) // It will be analyzed later. @@ -24228,7 +24339,8 @@ OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef VarList, if (Vars.empty()) return nullptr; - return OMPExclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); + return OMPExclusiveClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, Vars); } /// Tries to find omp_alloctrait_t type. @@ -24246,19 +24358,20 @@ static bool findOMPAlloctraitT(Sema &S, SourceLocation Loc, DSAStackTy *Stack) { return true; } -OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( +OMPClause *SemaOpenMP::ActOnOpenMPUsesAllocatorClause( SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc, ArrayRef Data) { + ASTContext &Context = getASTContext(); // OpenMP [2.12.5, target Construct] // allocator is an identifier of omp_allocator_handle_t type. - if (!findOMPAllocatorHandleT(*this, StartLoc, DSAStack)) + if (!findOMPAllocatorHandleT(SemaRef, StartLoc, DSAStack)) return nullptr; // OpenMP [2.12.5, target Construct] // allocator-traits-array is an identifier of const omp_alloctrait_t * type. if (llvm::any_of( Data, [](const UsesAllocatorsData &D) { return D.AllocatorTraits; }) && - !findOMPAlloctraitT(*this, StartLoc, DSAStack)) + !findOMPAlloctraitT(SemaRef, StartLoc, DSAStack)) return nullptr; llvm::SmallPtrSet, 4> PredefinedAllocators; for (int I = 0; I < OMPAllocateDeclAttr::OMPUserDefinedMemAlloc; ++I) { @@ -24266,8 +24379,8 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( StringRef Allocator = OMPAllocateDeclAttr::ConvertAllocatorTypeTyToStr(AllocatorKind); DeclarationName AllocatorName = &Context.Idents.get(Allocator); - PredefinedAllocators.insert(LookupSingleName( - TUScope, AllocatorName, StartLoc, Sema::LookupAnyName)); + PredefinedAllocators.insert(SemaRef.LookupSingleName( + SemaRef.TUScope, AllocatorName, StartLoc, Sema::LookupAnyName)); } SmallVector NewData; @@ -24284,7 +24397,7 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( bool IsPredefinedAllocator = false; if (DRE) { OMPAllocateDeclAttr::AllocatorTypeTy AllocatorTy = - getAllocatorKind(*this, DSAStack, AllocatorExpr); + getAllocatorKind(SemaRef, DSAStack, AllocatorExpr); IsPredefinedAllocator = AllocatorTy != OMPAllocateDeclAttr::AllocatorTypeTy::OMPUserDefinedMemAlloc; @@ -24329,7 +24442,7 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( } // No allocator traits - just convert it to rvalue. 
if (!D.AllocatorTraits) - AllocatorExpr = DefaultLvalueConversion(AllocatorExpr).get(); + AllocatorExpr = SemaRef.DefaultLvalueConversion(AllocatorExpr).get(); DSAStack->addUsesAllocatorsDecl( DRE->getDecl(), IsPredefinedAllocator @@ -24376,11 +24489,11 @@ OMPClause *Sema::ActOnOpenMPUsesAllocatorClause( NewD.LParenLoc = D.LParenLoc; NewD.RParenLoc = D.RParenLoc; } - return OMPUsesAllocatorsClause::Create(Context, StartLoc, LParenLoc, EndLoc, - NewData); + return OMPUsesAllocatorsClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, NewData); } -OMPClause *Sema::ActOnOpenMPAffinityClause( +OMPClause *SemaOpenMP::ActOnOpenMPAffinityClause( SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, Expr *Modifier, ArrayRef Locators) { SmallVector Vars; @@ -24403,8 +24516,8 @@ OMPClause *Sema::ActOnOpenMPAffinityClause( ExprResult Res; { - Sema::TentativeAnalysisScope Trap(*this); - Res = CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr); + Sema::TentativeAnalysisScope Trap(SemaRef); + Res = SemaRef.CreateBuiltinUnaryOp(ELoc, UO_AddrOf, SimpleExpr); } if (!Res.isUsable() && !isa(SimpleExpr) && !isa(SimpleExpr)) { @@ -24415,15 +24528,15 @@ OMPClause *Sema::ActOnOpenMPAffinityClause( Vars.push_back(SimpleExpr); } - return OMPAffinityClause::Create(Context, StartLoc, LParenLoc, ColonLoc, - EndLoc, Modifier, Vars); + return OMPAffinityClause::Create(getASTContext(), StartLoc, LParenLoc, + ColonLoc, EndLoc, Modifier, Vars); } -OMPClause *Sema::ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, - SourceLocation KindLoc, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, + SourceLocation KindLoc, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { if (Kind == OMPC_BIND_unknown) { Diag(KindLoc, diag::err_omp_unexpected_clause_value) << getListOfPossibleValues(OMPC_bind, /*First=*/0, @@ -24432,39 +24545,40 @@ OMPClause *Sema::ActOnOpenMPBindClause(OpenMPBindClauseKind Kind, return nullptr; } - return OMPBindClause::Create(Context, Kind, KindLoc, StartLoc, LParenLoc, - EndLoc); + return OMPBindClause::Create(getASTContext(), Kind, KindLoc, StartLoc, + LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPXDynCGroupMemClause(Expr *Size, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { +OMPClause *SemaOpenMP::ActOnOpenMPXDynCGroupMemClause(Expr *Size, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { Expr *ValExpr = Size; Stmt *HelperValStmt = nullptr; // OpenMP [2.5, Restrictions] // The ompx_dyn_cgroup_mem expression must evaluate to a positive integer // value. 
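
For reference, the `bind` clause validated above ties a `loop` construct to a binding region; the accepted values are `teams`, `parallel` and `thread`, and anything else hits the `err_omp_unexpected_clause_value` path (sketch; `work` is a placeholder):

    void iters(int n) {
    #pragma omp parallel
    #pragma omp loop bind(parallel)
      for (int i = 0; i < n; ++i)
        work(i);
    }
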
- if (!isNonNegativeIntegerValue(ValExpr, *this, OMPC_ompx_dyn_cgroup_mem, + if (!isNonNegativeIntegerValue(ValExpr, SemaRef, OMPC_ompx_dyn_cgroup_mem, /*StrictlyPositive=*/false)) return nullptr; OpenMPDirectiveKind DKind = DSAStack->getCurrentDirective(); OpenMPDirectiveKind CaptureRegion = getOpenMPCaptureRegionForClause( - DKind, OMPC_ompx_dyn_cgroup_mem, LangOpts.OpenMP); - if (CaptureRegion != OMPD_unknown && !CurContext->isDependentContext()) { - ValExpr = MakeFullExpr(ValExpr).get(); + DKind, OMPC_ompx_dyn_cgroup_mem, getLangOpts().OpenMP); + if (CaptureRegion != OMPD_unknown && + !SemaRef.CurContext->isDependentContext()) { + ValExpr = SemaRef.MakeFullExpr(ValExpr).get(); llvm::MapVector Captures; - ValExpr = tryBuildCapture(*this, ValExpr, Captures).get(); - HelperValStmt = buildPreInits(Context, Captures); + ValExpr = tryBuildCapture(SemaRef, ValExpr, Captures).get(); + HelperValStmt = buildPreInits(getASTContext(), Captures); } - return new (Context) OMPXDynCGroupMemClause( + return new (getASTContext()) OMPXDynCGroupMemClause( ValExpr, HelperValStmt, CaptureRegion, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDoacrossClause( +OMPClause *SemaOpenMP::ActOnOpenMPDoacrossClause( OpenMPDoacrossClauseModifier DepType, SourceLocation DepLoc, SourceLocation ColonLoc, ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { @@ -24483,7 +24597,7 @@ OMPClause *Sema::ActOnOpenMPDoacrossClause( DSAStackTy::OperatorOffsetTy OpsOffs; llvm::APSInt TotalDepCount(/*BitWidth=*/32); DoacrossDataInfoTy VarOffset = ProcessOpenMPDoacrossClauseCommon( - *this, + SemaRef, DepType == OMPC_DOACROSS_source || DepType == OMPC_DOACROSS_source_omp_cur_iteration || DepType == OMPC_DOACROSS_sink_omp_cur_iteration, @@ -24491,22 +24605,587 @@ OMPClause *Sema::ActOnOpenMPDoacrossClause( Vars = VarOffset.Vars; OpsOffs = VarOffset.OpsOffs; TotalDepCount = VarOffset.TotalDepCount; - auto *C = OMPDoacrossClause::Create(Context, StartLoc, LParenLoc, EndLoc, - DepType, DepLoc, ColonLoc, Vars, + auto *C = OMPDoacrossClause::Create(getASTContext(), StartLoc, LParenLoc, + EndLoc, DepType, DepLoc, ColonLoc, Vars, TotalDepCount.getZExtValue()); if (DSAStack->isParentOrderedRegion()) DSAStack->addDoacrossDependClause(C, OpsOffs); return C; } -OMPClause *Sema::ActOnOpenMPXAttributeClause(ArrayRef Attrs, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation EndLoc) { - return new (Context) OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPXAttributeClause(ArrayRef Attrs, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return new (getASTContext()) + OMPXAttributeClause(Attrs, StartLoc, LParenLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPXBareClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPXBareClause(StartLoc, EndLoc); +OMPClause *SemaOpenMP::ActOnOpenMPXBareClause(SourceLocation StartLoc, + SourceLocation EndLoc) { + return new (getASTContext()) OMPXBareClause(StartLoc, EndLoc); +} + +ExprResult SemaOpenMP::ActOnOMPArraySectionExpr( + Expr *Base, SourceLocation LBLoc, Expr *LowerBound, + SourceLocation ColonLocFirst, SourceLocation ColonLocSecond, Expr *Length, + Expr *Stride, SourceLocation RBLoc) { + ASTContext &Context = getASTContext(); + if (Base->hasPlaceholderType() && + !Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Base); + if (Result.isInvalid()) + return ExprError(); + 
Base = Result.get(); + } + if (LowerBound && LowerBound->getType()->isNonOverloadPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(LowerBound); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + LowerBound = Result.get(); + } + if (Length && Length->getType()->isNonOverloadPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Length); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + Length = Result.get(); + } + if (Stride && Stride->getType()->isNonOverloadPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Stride); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + Stride = Result.get(); + } + + // Build an unanalyzed expression if either operand is type-dependent. + if (Base->isTypeDependent() || + (LowerBound && + (LowerBound->isTypeDependent() || LowerBound->isValueDependent())) || + (Length && (Length->isTypeDependent() || Length->isValueDependent())) || + (Stride && (Stride->isTypeDependent() || Stride->isValueDependent()))) { + return new (Context) OMPArraySectionExpr( + Base, LowerBound, Length, Stride, Context.DependentTy, VK_LValue, + OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); + } + + // Perform default conversions. + QualType OriginalTy = OMPArraySectionExpr::getBaseOriginalType(Base); + QualType ResultTy; + if (OriginalTy->isAnyPointerType()) { + ResultTy = OriginalTy->getPointeeType(); + } else if (OriginalTy->isArrayType()) { + ResultTy = OriginalTy->getAsArrayTypeUnsafe()->getElementType(); + } else { + return ExprError( + Diag(Base->getExprLoc(), diag::err_omp_typecheck_section_value) + << Base->getSourceRange()); + } + // C99 6.5.2.1p1 + if (LowerBound) { + auto Res = PerformOpenMPImplicitIntegerConversion(LowerBound->getExprLoc(), + LowerBound); + if (Res.isInvalid()) + return ExprError(Diag(LowerBound->getExprLoc(), + diag::err_omp_typecheck_section_not_integer) + << 0 << LowerBound->getSourceRange()); + LowerBound = Res.get(); + + if (LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || + LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) + Diag(LowerBound->getExprLoc(), diag::warn_omp_section_is_char) + << 0 << LowerBound->getSourceRange(); + } + if (Length) { + auto Res = + PerformOpenMPImplicitIntegerConversion(Length->getExprLoc(), Length); + if (Res.isInvalid()) + return ExprError(Diag(Length->getExprLoc(), + diag::err_omp_typecheck_section_not_integer) + << 1 << Length->getSourceRange()); + Length = Res.get(); + + if (Length->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || + Length->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) + Diag(Length->getExprLoc(), diag::warn_omp_section_is_char) + << 1 << Length->getSourceRange(); + } + if (Stride) { + ExprResult Res = + PerformOpenMPImplicitIntegerConversion(Stride->getExprLoc(), Stride); + if (Res.isInvalid()) + return ExprError(Diag(Stride->getExprLoc(), + diag::err_omp_typecheck_section_not_integer) + << 1 << Stride->getSourceRange()); + Stride = Res.get(); + + if (Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_S) || + Stride->getType()->isSpecificBuiltinType(BuiltinType::Char_U)) + Diag(Stride->getExprLoc(), diag::warn_omp_section_is_char) + << 1 << Stride->getSourceRange(); + 
} + + // C99 6.5.2.1p1: "shall have type "pointer to *object* type". Similarly, + // C++ [expr.sub]p1: The type "T" shall be a completely-defined object + // type. Note that functions are not objects, and that (in C99 parlance) + // incomplete types are not object types. + if (ResultTy->isFunctionType()) { + Diag(Base->getExprLoc(), diag::err_omp_section_function_type) + << ResultTy << Base->getSourceRange(); + return ExprError(); + } + + if (SemaRef.RequireCompleteType(Base->getExprLoc(), ResultTy, + diag::err_omp_section_incomplete_type, Base)) + return ExprError(); + + if (LowerBound && !OriginalTy->isAnyPointerType()) { + Expr::EvalResult Result; + if (LowerBound->EvaluateAsInt(Result, Context)) { + // OpenMP 5.0, [2.1.5 Array Sections] + // The array section must be a subset of the original array. + llvm::APSInt LowerBoundValue = Result.Val.getInt(); + if (LowerBoundValue.isNegative()) { + Diag(LowerBound->getExprLoc(), + diag::err_omp_section_not_subset_of_array) + << LowerBound->getSourceRange(); + return ExprError(); + } + } + } + + if (Length) { + Expr::EvalResult Result; + if (Length->EvaluateAsInt(Result, Context)) { + // OpenMP 5.0, [2.1.5 Array Sections] + // The length must evaluate to non-negative integers. + llvm::APSInt LengthValue = Result.Val.getInt(); + if (LengthValue.isNegative()) { + Diag(Length->getExprLoc(), diag::err_omp_section_length_negative) + << toString(LengthValue, /*Radix=*/10, /*Signed=*/true) + << Length->getSourceRange(); + return ExprError(); + } + } + } else if (ColonLocFirst.isValid() && + (OriginalTy.isNull() || (!OriginalTy->isConstantArrayType() && + !OriginalTy->isVariableArrayType()))) { + // OpenMP 5.0, [2.1.5 Array Sections] + // When the size of the array dimension is not known, the length must be + // specified explicitly. + Diag(ColonLocFirst, diag::err_omp_section_length_undefined) + << (!OriginalTy.isNull() && OriginalTy->isArrayType()); + return ExprError(); + } + + if (Stride) { + Expr::EvalResult Result; + if (Stride->EvaluateAsInt(Result, Context)) { + // OpenMP 5.0, [2.1.5 Array Sections] + // The stride must evaluate to a positive integer. + llvm::APSInt StrideValue = Result.Val.getInt(); + if (!StrideValue.isStrictlyPositive()) { + Diag(Stride->getExprLoc(), diag::err_omp_section_stride_non_positive) + << toString(StrideValue, /*Radix=*/10, /*Signed=*/true) + << Stride->getSourceRange(); + return ExprError(); + } + } + } + + if (!Base->hasPlaceholderType(BuiltinType::OMPArraySection)) { + ExprResult Result = SemaRef.DefaultFunctionArrayLvalueConversion(Base); + if (Result.isInvalid()) + return ExprError(); + Base = Result.get(); + } + return new (Context) OMPArraySectionExpr( + Base, LowerBound, Length, Stride, Context.OMPArraySectionTy, VK_LValue, + OK_Ordinary, ColonLocFirst, ColonLocSecond, RBLoc); +} + +ExprResult SemaOpenMP::ActOnOMPArrayShapingExpr( + Expr *Base, SourceLocation LParenLoc, SourceLocation RParenLoc, + ArrayRef Dims, ArrayRef Brackets) { + ASTContext &Context = getASTContext(); + if (Base->hasPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Base); + if (Result.isInvalid()) + return ExprError(); + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) + return ExprError(); + Base = Result.get(); + } + QualType BaseTy = Base->getType(); + // Delay analysis of the types/expressions if instantiation/specialization is + // required. 
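
`ActOnOMPArraySectionExpr` above enforces the OpenMP 5.0 array-section rules: a constant lower bound must be non-negative, a constant length non-negative, a constant stride strictly positive, and for a pointer base the length is mandatory because the dimension is unknown (so `p[0:]` is rejected with `err_omp_section_length_undefined`). A well-formed example (illustrative):

    void touch(int *p, int len) {
      // base 'p', lower bound 0, length 'len', implicit stride 1
    #pragma omp target map(tofrom : p[0:len])
      { p[0] = 1; }
    }
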
+ if (!BaseTy->isPointerType() && Base->isTypeDependent()) + return OMPArrayShapingExpr::Create(Context, Context.DependentTy, Base, + LParenLoc, RParenLoc, Dims, Brackets); + if (!BaseTy->isPointerType() || + (!Base->isTypeDependent() && + BaseTy->getPointeeType()->isIncompleteType())) + return ExprError(Diag(Base->getExprLoc(), + diag::err_omp_non_pointer_type_array_shaping_base) + << Base->getSourceRange()); + + SmallVector NewDims; + bool ErrorFound = false; + for (Expr *Dim : Dims) { + if (Dim->hasPlaceholderType()) { + ExprResult Result = SemaRef.CheckPlaceholderExpr(Dim); + if (Result.isInvalid()) { + ErrorFound = true; + continue; + } + Result = SemaRef.DefaultLvalueConversion(Result.get()); + if (Result.isInvalid()) { + ErrorFound = true; + continue; + } + Dim = Result.get(); + } + if (!Dim->isTypeDependent()) { + ExprResult Result = + PerformOpenMPImplicitIntegerConversion(Dim->getExprLoc(), Dim); + if (Result.isInvalid()) { + ErrorFound = true; + Diag(Dim->getExprLoc(), diag::err_omp_typecheck_shaping_not_integer) + << Dim->getSourceRange(); + continue; + } + Dim = Result.get(); + Expr::EvalResult EvResult; + if (!Dim->isValueDependent() && Dim->EvaluateAsInt(EvResult, Context)) { + // OpenMP 5.0, [2.1.4 Array Shaping] + // Each si is an integral type expression that must evaluate to a + // positive integer. + llvm::APSInt Value = EvResult.Val.getInt(); + if (!Value.isStrictlyPositive()) { + Diag(Dim->getExprLoc(), diag::err_omp_shaping_dimension_not_positive) + << toString(Value, /*Radix=*/10, /*Signed=*/true) + << Dim->getSourceRange(); + ErrorFound = true; + continue; + } + } + } + NewDims.push_back(Dim); + } + if (ErrorFound) + return ExprError(); + return OMPArrayShapingExpr::Create(Context, Context.OMPArrayShapingTy, Base, + LParenLoc, RParenLoc, NewDims, Brackets); } + +ExprResult SemaOpenMP::ActOnOMPIteratorExpr(Scope *S, + SourceLocation IteratorKwLoc, + SourceLocation LLoc, + SourceLocation RLoc, + ArrayRef Data) { + ASTContext &Context = getASTContext(); + SmallVector ID; + bool IsCorrect = true; + for (const OMPIteratorData &D : Data) { + TypeSourceInfo *TInfo = nullptr; + SourceLocation StartLoc; + QualType DeclTy; + if (!D.Type.getAsOpaquePtr()) { + // OpenMP 5.0, 2.1.6 Iterators + // In an iterator-specifier, if the iterator-type is not specified then + // the type of that iterator is of int type. + DeclTy = Context.IntTy; + StartLoc = D.DeclIdentLoc; + } else { + DeclTy = Sema::GetTypeFromParser(D.Type, &TInfo); + StartLoc = TInfo->getTypeLoc().getBeginLoc(); + } + + bool IsDeclTyDependent = DeclTy->isDependentType() || + DeclTy->containsUnexpandedParameterPack() || + DeclTy->isInstantiationDependentType(); + if (!IsDeclTyDependent) { + if (!DeclTy->isIntegralType(Context) && !DeclTy->isAnyPointerType()) { + // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ + // The iterator-type must be an integral or pointer type. + Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) + << DeclTy; + IsCorrect = false; + continue; + } + if (DeclTy.isConstant(Context)) { + // OpenMP 5.0, 2.1.6 Iterators, Restrictions, C/C++ + // The iterator-type must not be const qualified. + Diag(StartLoc, diag::err_omp_iterator_not_integral_or_pointer) + << DeclTy; + IsCorrect = false; + continue; + } + } + + // Iterator declaration. + assert(D.DeclIdent && "Identifier expected."); + // Always try to create iterator declarator to avoid extra error messages + // about unknown declarations use. 
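
`ActOnOMPArrayShapingExpr` above requires a pointer to a complete type as the base and strictly positive dimensions; the expression gives a pointer an array view for data movement (sketch, assuming the data was mapped earlier):

    void push(double *base, int n) {
    #pragma omp target enter data map(to : base[0:n * 10])
      // reinterpret 'base' as an n x 10 array for the transfer
    #pragma omp target update to(([n][10])base)
    }
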
+    auto *VD =
+        VarDecl::Create(Context, SemaRef.CurContext, StartLoc, D.DeclIdentLoc,
+                        D.DeclIdent, DeclTy, TInfo, SC_None);
+    VD->setImplicit();
+    if (S) {
+      // Check for conflicting previous declaration.
+      DeclarationNameInfo NameInfo(VD->getDeclName(), D.DeclIdentLoc);
+      LookupResult Previous(SemaRef, NameInfo, Sema::LookupOrdinaryName,
+                            RedeclarationKind::ForVisibleRedeclaration);
+      Previous.suppressDiagnostics();
+      SemaRef.LookupName(Previous, S);
+
+      SemaRef.FilterLookupForScope(Previous, SemaRef.CurContext, S,
+                                   /*ConsiderLinkage=*/false,
+                                   /*AllowInlineNamespace=*/false);
+      if (!Previous.empty()) {
+        NamedDecl *Old = Previous.getRepresentativeDecl();
+        Diag(D.DeclIdentLoc, diag::err_redefinition) << VD->getDeclName();
+        Diag(Old->getLocation(), diag::note_previous_definition);
+      } else {
+        SemaRef.PushOnScopeChains(VD, S);
+      }
+    } else {
+      SemaRef.CurContext->addDecl(VD);
+    }
+
+    /// Act on the iterator variable declaration.
+    ActOnOpenMPIteratorVarDecl(VD);
+
+    Expr *Begin = D.Range.Begin;
+    if (!IsDeclTyDependent && Begin && !Begin->isTypeDependent()) {
+      ExprResult BeginRes =
+          SemaRef.PerformImplicitConversion(Begin, DeclTy, Sema::AA_Converting);
+      Begin = BeginRes.get();
+    }
+    Expr *End = D.Range.End;
+    if (!IsDeclTyDependent && End && !End->isTypeDependent()) {
+      ExprResult EndRes =
+          SemaRef.PerformImplicitConversion(End, DeclTy, Sema::AA_Converting);
+      End = EndRes.get();
+    }
+    Expr *Step = D.Range.Step;
+    if (!IsDeclTyDependent && Step && !Step->isTypeDependent()) {
+      if (!Step->getType()->isIntegralType(Context)) {
+        Diag(Step->getExprLoc(), diag::err_omp_iterator_step_not_integral)
+            << Step << Step->getSourceRange();
+        IsCorrect = false;
+        continue;
+      }
+      std::optional<llvm::APSInt> Result =
+          Step->getIntegerConstantExpr(Context);
+      // OpenMP 5.0, 2.1.6 Iterators, Restrictions
+      // If the step expression of a range-specification equals zero, the
+      // behavior is unspecified.
+      if (Result && Result->isZero()) {
+        Diag(Step->getExprLoc(), diag::err_omp_iterator_step_constant_zero)
+            << Step << Step->getSourceRange();
+        IsCorrect = false;
+        continue;
+      }
+    }
+    if (!Begin || !End || !IsCorrect) {
+      IsCorrect = false;
+      continue;
+    }
+    OMPIteratorExpr::IteratorDefinition &IDElem = ID.emplace_back();
+    IDElem.IteratorDecl = VD;
+    IDElem.AssignmentLoc = D.AssignLoc;
+    IDElem.Range.Begin = Begin;
+    IDElem.Range.End = End;
+    IDElem.Range.Step = Step;
+    IDElem.ColonLoc = D.ColonLoc;
+    IDElem.SecondColonLoc = D.SecColonLoc;
+  }
+  if (!IsCorrect) {
+    // Invalidate all created iterator declarations if error is found.
+    for (const OMPIteratorExpr::IteratorDefinition &D : ID) {
+      if (Decl *ID = D.IteratorDecl)
+        ID->setInvalidDecl();
+    }
+    return ExprError();
+  }
+  SmallVector<OMPIteratorHelperData, 4> Helpers;
+  if (!SemaRef.CurContext->isDependentContext()) {
+    // Build the number of iterations for each iteration range.
+    // Ni = ((Stepi > 0) ?
((Endi + Stepi -1 - Begini)/Stepi) : + // ((Begini-Stepi-1-Endi) / -Stepi); + for (OMPIteratorExpr::IteratorDefinition &D : ID) { + // (Endi - Begini) + ExprResult Res = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Sub, + D.Range.End, D.Range.Begin); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + ExprResult St, St1; + if (D.Range.Step) { + St = D.Range.Step; + // (Endi - Begini) + Stepi + Res = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res.get(), + St.get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + // (Endi - Begini) + Stepi - 1 + Res = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Sub, Res.get(), + SemaRef.ActOnIntegerConstant(D.AssignmentLoc, 1).get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + // ((Endi - Begini) + Stepi - 1) / Stepi + Res = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res.get(), + St.get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + St1 = SemaRef.CreateBuiltinUnaryOp(D.AssignmentLoc, UO_Minus, + D.Range.Step); + // (Begini - Endi) + ExprResult Res1 = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Sub, D.Range.Begin, D.Range.End); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // (Begini - Endi) - Stepi + Res1 = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, Res1.get(), + St1.get()); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // (Begini - Endi) - Stepi - 1 + Res1 = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Sub, Res1.get(), + SemaRef.ActOnIntegerConstant(D.AssignmentLoc, 1).get()); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // ((Begini - Endi) - Stepi - 1) / (-Stepi) + Res1 = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Div, Res1.get(), + St1.get()); + if (!Res1.isUsable()) { + IsCorrect = false; + continue; + } + // Stepi > 0. + ExprResult CmpRes = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_GT, D.Range.Step, + SemaRef.ActOnIntegerConstant(D.AssignmentLoc, 0).get()); + if (!CmpRes.isUsable()) { + IsCorrect = false; + continue; + } + Res = SemaRef.ActOnConditionalOp(D.AssignmentLoc, D.AssignmentLoc, + CmpRes.get(), Res.get(), Res1.get()); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + } + Res = SemaRef.ActOnFinishFullExpr(Res.get(), /*DiscardedValue=*/false); + if (!Res.isUsable()) { + IsCorrect = false; + continue; + } + + // Build counter update. + // Build counter. + auto *CounterVD = VarDecl::Create(Context, SemaRef.CurContext, + D.IteratorDecl->getBeginLoc(), + D.IteratorDecl->getBeginLoc(), nullptr, + Res.get()->getType(), nullptr, SC_None); + CounterVD->setImplicit(); + ExprResult RefRes = + SemaRef.BuildDeclRefExpr(CounterVD, CounterVD->getType(), VK_LValue, + D.IteratorDecl->getBeginLoc()); + // Build counter update. 
+ // I = Begini + counter * Stepi; + ExprResult UpdateRes; + if (D.Range.Step) { + UpdateRes = SemaRef.CreateBuiltinBinOp( + D.AssignmentLoc, BO_Mul, + SemaRef.DefaultLvalueConversion(RefRes.get()).get(), St.get()); + } else { + UpdateRes = SemaRef.DefaultLvalueConversion(RefRes.get()); + } + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + UpdateRes = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Add, + D.Range.Begin, UpdateRes.get()); + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + ExprResult VDRes = + SemaRef.BuildDeclRefExpr(cast(D.IteratorDecl), + cast(D.IteratorDecl)->getType(), + VK_LValue, D.IteratorDecl->getBeginLoc()); + UpdateRes = SemaRef.CreateBuiltinBinOp(D.AssignmentLoc, BO_Assign, + VDRes.get(), UpdateRes.get()); + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + UpdateRes = + SemaRef.ActOnFinishFullExpr(UpdateRes.get(), /*DiscardedValue=*/true); + if (!UpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + ExprResult CounterUpdateRes = SemaRef.CreateBuiltinUnaryOp( + D.AssignmentLoc, UO_PreInc, RefRes.get()); + if (!CounterUpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + CounterUpdateRes = SemaRef.ActOnFinishFullExpr(CounterUpdateRes.get(), + /*DiscardedValue=*/true); + if (!CounterUpdateRes.isUsable()) { + IsCorrect = false; + continue; + } + OMPIteratorHelperData &HD = Helpers.emplace_back(); + HD.CounterVD = CounterVD; + HD.Upper = Res.get(); + HD.Update = UpdateRes.get(); + HD.CounterUpdate = CounterUpdateRes.get(); + } + } else { + Helpers.assign(ID.size(), {}); + } + if (!IsCorrect) { + // Invalidate all created iterator declarations if error is found. + for (const OMPIteratorExpr::IteratorDefinition &D : ID) { + if (Decl *ID = D.IteratorDecl) + ID->setInvalidDecl(); + } + return ExprError(); + } + return OMPIteratorExpr::Create(Context, Context.OMPIteratorTy, IteratorKwLoc, + LLoc, RLoc, ID, Helpers); +} + +SemaOpenMP::SemaOpenMP(Sema &S) + : SemaBase(S), VarDataSharingAttributesStack(nullptr) {} diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 227ef564ba3e0..adc319e97b762 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14506,7 +14506,7 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, // operator node. ExprResult InputRes = PerformImplicitConversion( Input, Best->BuiltinParamTypes[0], Best->Conversions[0], AA_Passing, - CCK_ForBuiltinOverloadedOp); + CheckedConversionKind::ForBuiltinOverloadedOp); if (InputRes.isInvalid()) return ExprError(); Input = InputRes.get(); @@ -14989,14 +14989,14 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // operator node. ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); @@ -15367,14 +15367,14 @@ ExprResult Sema::CreateOverloadedArraySubscriptExpr(SourceLocation LLoc, // operator node. 
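
The helper expressions built above materialize the iterator trip count and the counter-to-iterator mapping; plugging a concrete range into the Ni formula shows the arithmetic (assumed example):

    extern int a[16];

    void producer() {
      // positive step: N = (end + step - 1 - begin) / step = (10 + 3 - 1 - 0) / 3 = 4,
      // so i visits 0, 3, 6, 9; counter c maps back as i = begin + c * step.
    #pragma omp task depend(iterator(int i = 0 : 10 : 3), in : a[i])
      { }
    }
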
ExprResult ArgsRes0 = PerformImplicitConversion( Args[0], Best->BuiltinParamTypes[0], Best->Conversions[0], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes0.isInvalid()) return ExprError(); Args[0] = ArgsRes0.get(); ExprResult ArgsRes1 = PerformImplicitConversion( Args[1], Best->BuiltinParamTypes[1], Best->Conversions[1], - AA_Passing, CCK_ForBuiltinOverloadedOp); + AA_Passing, CheckedConversionKind::ForBuiltinOverloadedOp); if (ArgsRes1.isInvalid()) return ExprError(); Args[1] = ArgsRes1.get(); diff --git a/clang/lib/Sema/SemaPseudoObject.cpp b/clang/lib/Sema/SemaPseudoObject.cpp index 82774760b34d4..c6a0a182d3583 100644 --- a/clang/lib/Sema/SemaPseudoObject.cpp +++ b/clang/lib/Sema/SemaPseudoObject.cpp @@ -1136,7 +1136,7 @@ static void CheckKeyForObjCARCConversion(Sema &S, QualType ContainerT, return; QualType T = Getter->parameters()[0]->getType(); S.CheckObjCConversion(Key->getSourceRange(), T, Key, - Sema::CCK_ImplicitConversion); + CheckedConversionKind::Implicit); } bool ObjCSubscriptOpBuilder::findAtIndexGetter() { diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index d28c24cfdfd33..a7b33f0db047e 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -35,6 +35,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -3097,7 +3098,7 @@ StmtResult Sema::BuildCXXForRangeStmt( // In OpenMP loop region loop control variable must be private. Perform // analysis of first part (if any). if (getLangOpts().OpenMP >= 50 && BeginDeclStmt.isUsable()) - ActOnOpenMPLoopInitialization(ForLoc, BeginDeclStmt.get()); + OpenMP().ActOnOpenMPLoopInitialization(ForLoc, BeginDeclStmt.get()); return new (Context) CXXForRangeStmt( InitStmt, RangeDS, cast_or_null(BeginDeclStmt.get()), @@ -4822,7 +4823,8 @@ buildCapturedStmtCaptureList(Sema &S, CapturedRegionScopeInfo *RSI, assert(Cap.isVariableCapture() && "unknown kind of capture"); if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) - S.setOpenMPCaptureKind(Field, Cap.getVariable(), RSI->OpenMPLevel); + S.OpenMP().setOpenMPCaptureKind(Field, Cap.getVariable(), + RSI->OpenMPLevel); Captures.push_back(CapturedStmt::Capture( Cap.getLocation(), diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 95171359f0ab1..f4b6e1ceb6f02 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -972,8 +972,9 @@ void Sema::translateTemplateArguments(const ASTTemplateArgsPtr &TemplateArgsIn, static void maybeDiagnoseTemplateParameterShadow(Sema &SemaRef, Scope *S, SourceLocation Loc, const IdentifierInfo *Name) { - NamedDecl *PrevDecl = SemaRef.LookupSingleName( - S, Name, Loc, Sema::LookupOrdinaryName, Sema::ForVisibleRedeclaration); + NamedDecl *PrevDecl = + SemaRef.LookupSingleName(S, Name, Loc, Sema::LookupOrdinaryName, + RedeclarationKind::ForVisibleRedeclaration); if (PrevDecl && PrevDecl->isTemplateParameter()) SemaRef.DiagnoseTemplateParameterShadow(Loc, PrevDecl); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index c45a8d1408fff..caa07abb61fe3 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -28,6 +28,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" 
#include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateInstCallback.h" #include "llvm/Support/TimeProfiler.h" @@ -399,7 +400,7 @@ static void instantiateOMPDeclareSimdDeclAttr( ++SI; } LinModifiers.append(Attr.modifiers_begin(), Attr.modifiers_end()); - (void)S.ActOnOpenMPDeclareSimdDirective( + (void)S.OpenMP().ActOnOpenMPDeclareSimdDirective( S.ConvertDeclToDeclGroup(New), Attr.getBranchState(), Simdlen.get(), Uniforms, Aligneds, Alignments, Linears, LinModifiers, Steps, Attr.getRange()); @@ -476,9 +477,9 @@ static void instantiateOMPDeclareVariantAttr( // Check function/variant ref for `omp declare variant` but not for `omp // begin declare variant` (which use implicit attributes). std::optional> DeclVarData = - S.checkOpenMPDeclareVariantFunction(S.ConvertDeclToDeclGroup(New), E, TI, - Attr.appendArgs_size(), - Attr.getRange()); + S.OpenMP().checkOpenMPDeclareVariantFunction( + S.ConvertDeclToDeclGroup(New), E, TI, Attr.appendArgs_size(), + Attr.getRange()); if (!DeclVarData) return; @@ -539,7 +540,7 @@ static void instantiateOMPDeclareVariantAttr( AppendArgs.emplace_back(II.IsTarget, II.IsTargetSync); } - S.ActOnOpenMPDeclareVariantDirective( + S.OpenMP().ActOnOpenMPDeclareVariantDirective( FD, E, TI, NothingExprs, NeedDevicePtrExprs, AppendArgs, SourceLocation(), SourceLocation(), Attr.getRange()); } @@ -2295,7 +2296,7 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( SemaRef, Function->getDeclName(), SourceLocation(), D->isLocalExternDecl() ? Sema::LookupRedeclarationWithLinkage : Sema::LookupOrdinaryName, - D->isLocalExternDecl() ? Sema::ForExternalRedeclaration + D->isLocalExternDecl() ? RedeclarationKind::ForExternalRedeclaration : SemaRef.forRedeclarationInCurContext()); if (DependentFunctionTemplateSpecializationInfo *DFTSI = @@ -2696,7 +2697,7 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( Method->setInvalidDecl(); LookupResult Previous(SemaRef, NameInfo, Sema::LookupOrdinaryName, - Sema::ForExternalRedeclaration); + RedeclarationKind::ForExternalRedeclaration); bool IsExplicitSpecialization = false; @@ -3364,7 +3365,7 @@ Decl *TemplateDeclInstantiator::VisitUsingDecl(UsingDecl *D) { // fact, it's not really even possible in non-class scopes). 
bool CheckRedeclaration = Owner->isRecord(); LookupResult Prev(SemaRef, NameInfo, Sema::LookupUsingDeclName, - Sema::ForVisibleRedeclaration); + RedeclarationKind::ForVisibleRedeclaration); UsingDecl *NewUD = UsingDecl::Create(SemaRef.Context, Owner, D->getUsingLoc(), @@ -3587,7 +3588,7 @@ Decl *TemplateDeclInstantiator::VisitOMPThreadPrivateDecl( } OMPThreadPrivateDecl *TD = - SemaRef.CheckOMPThreadPrivateDecl(D->getLocation(), Vars); + SemaRef.OpenMP().CheckOMPThreadPrivateDecl(D->getLocation(), Vars); TD->setAccess(AS_public); Owner->addDecl(TD); @@ -3610,14 +3611,14 @@ Decl *TemplateDeclInstantiator::VisitOMPAllocateDecl(OMPAllocateDecl *D) { ExprResult NewE = SemaRef.SubstExpr(AC->getAllocator(), TemplateArgs); if (!NewE.isUsable()) continue; - IC = SemaRef.ActOnOpenMPAllocatorClause( + IC = SemaRef.OpenMP().ActOnOpenMPAllocatorClause( NewE.get(), AC->getBeginLoc(), AC->getLParenLoc(), AC->getEndLoc()); } else if (auto *AC = dyn_cast(C)) { ExprResult NewE = SemaRef.SubstExpr(AC->getAlignment(), TemplateArgs); if (!NewE.isUsable()) continue; - IC = SemaRef.ActOnOpenMPAlignClause(NewE.get(), AC->getBeginLoc(), - AC->getLParenLoc(), AC->getEndLoc()); + IC = SemaRef.OpenMP().ActOnOpenMPAlignClause( + NewE.get(), AC->getBeginLoc(), AC->getLParenLoc(), AC->getEndLoc()); // If align clause value ends up being invalid, this can end up null. if (!IC) continue; @@ -3625,7 +3626,7 @@ Decl *TemplateDeclInstantiator::VisitOMPAllocateDecl(OMPAllocateDecl *D) { Clauses.push_back(IC); } - Sema::DeclGroupPtrTy Res = SemaRef.ActOnOpenMPAllocateDirective( + Sema::DeclGroupPtrTy Res = SemaRef.OpenMP().ActOnOpenMPAllocateDirective( D->getLocation(), Vars, Clauses, Owner); if (Res.get().isNull()) return nullptr; @@ -3646,7 +3647,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( D->getType()->containsUnexpandedParameterPack(); QualType SubstReductionType; if (RequiresInstantiation) { - SubstReductionType = SemaRef.ActOnOpenMPDeclareReductionType( + SubstReductionType = SemaRef.OpenMP().ActOnOpenMPDeclareReductionType( D->getLocation(), ParsedType::make(SemaRef.SubstType( D->getType(), TemplateArgs, D->getLocation(), DeclarationName()))); @@ -3667,7 +3668,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( SemaRef.CurrentInstantiationScope->findInstantiationOf(PrevDeclInScope) ->get()); } - auto DRD = SemaRef.ActOnOpenMPDeclareReductionDirectiveStart( + auto DRD = SemaRef.OpenMP().ActOnOpenMPDeclareReductionDirectiveStart( /*S=*/nullptr, Owner, D->getDeclName(), ReductionTypes, D->getAccess(), PrevDeclInScope); auto *NewDRD = cast(DRD.get().getSingleDecl()); @@ -3676,7 +3677,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( Expr *SubstInitializer = nullptr; // Combiners instantiation sequence. if (Combiner) { - SemaRef.ActOnOpenMPDeclareReductionCombinerStart( + SemaRef.OpenMP().ActOnOpenMPDeclareReductionCombinerStart( /*S=*/nullptr, NewDRD); SemaRef.CurrentInstantiationScope->InstantiatedLocal( cast(D->getCombinerIn())->getDecl(), @@ -3688,12 +3689,14 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( Sema::CXXThisScopeRAII ThisScope(SemaRef, ThisContext, Qualifiers(), ThisContext); SubstCombiner = SemaRef.SubstExpr(Combiner, TemplateArgs).get(); - SemaRef.ActOnOpenMPDeclareReductionCombinerEnd(NewDRD, SubstCombiner); + SemaRef.OpenMP().ActOnOpenMPDeclareReductionCombinerEnd(NewDRD, + SubstCombiner); } // Initializers instantiation sequence. 
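
`VisitOMPDeclareReductionDecl` above replays the combiner and initializer sequences when a dependent `declare reduction` is instantiated; the kind of user code that reaches it looks like this (illustrative):

    template <class T>
    T reduce_all(const T *v, int n) {
    #pragma omp declare reduction(merge : T : omp_out = omp_out + omp_in) \
        initializer(omp_priv = T())
      T r = T();
    #pragma omp parallel for reduction(merge : r)
      for (int i = 0; i < n; ++i)
        r = r + v[i];
      return r;
    }
    template int reduce_all<int>(const int *, int);  // instantiation replays the directives
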
if (Init) { - VarDecl *OmpPrivParm = SemaRef.ActOnOpenMPDeclareReductionInitializerStart( - /*S=*/nullptr, NewDRD); + VarDecl *OmpPrivParm = + SemaRef.OpenMP().ActOnOpenMPDeclareReductionInitializerStart( + /*S=*/nullptr, NewDRD); SemaRef.CurrentInstantiationScope->InstantiatedLocal( cast<DeclRefExpr>(D->getInitOrig())->getDecl(), cast<DeclRefExpr>(NewDRD->getInitOrig())->getDecl()); @@ -3710,8 +3713,8 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( SemaRef.InstantiateVariableInitializer(OmpPrivParm, OldPrivParm, TemplateArgs); } - SemaRef.ActOnOpenMPDeclareReductionInitializerEnd(NewDRD, SubstInitializer, - OmpPrivParm); + SemaRef.OpenMP().ActOnOpenMPDeclareReductionInitializerEnd( + NewDRD, SubstInitializer, OmpPrivParm); } IsCorrect = IsCorrect && SubstCombiner && (!Init || @@ -3720,7 +3723,7 @@ Decl *TemplateDeclInstantiator::VisitOMPDeclareReductionDecl( (D->getInitializerKind() != OMPDeclareReductionInitKind::Call && !SubstInitializer)); - (void)SemaRef.ActOnOpenMPDeclareReductionDirectiveEnd( + (void)SemaRef.OpenMP().ActOnOpenMPDeclareReductionDirectiveEnd( /*S=*/nullptr, DRD, IsCorrect && !D->isInvalidDecl()); return NewDRD; @@ -3736,7 +3739,7 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) { QualType SubstMapperTy; DeclarationName VN = D->getVarName(); if (RequiresInstantiation) { - SubstMapperTy = SemaRef.ActOnOpenMPDeclareMapperType( + SubstMapperTy = SemaRef.OpenMP().ActOnOpenMPDeclareMapperType( D->getLocation(), ParsedType::make(SemaRef.SubstType(D->getType(), TemplateArgs, D->getLocation(), VN))); @@ -3756,11 +3759,12 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) { SmallVector<OMPClause *, 6> Clauses; // Instantiate the mapper variable. DeclarationNameInfo DirName; - SemaRef.StartOpenMPDSABlock(llvm::omp::OMPD_declare_mapper, DirName, - /*S=*/nullptr, - (*D->clauselist_begin())->getBeginLoc()); - ExprResult MapperVarRef = SemaRef.ActOnOpenMPDeclareMapperDirectiveVarDecl( - /*S=*/nullptr, SubstMapperTy, D->getLocation(), VN); + SemaRef.OpenMP().StartOpenMPDSABlock(llvm::omp::OMPD_declare_mapper, DirName, + /*S=*/nullptr, + (*D->clauselist_begin())->getBeginLoc()); + ExprResult MapperVarRef = + SemaRef.OpenMP().ActOnOpenMPDeclareMapperDirectiveVarDecl( + /*S=*/nullptr, SubstMapperTy, D->getLocation(), VN); SemaRef.CurrentInstantiationScope->InstantiatedLocal( cast<DeclRefExpr>(D->getMapperVarRef())->getDecl(), cast<DeclRefExpr>(MapperVarRef.get())->getDecl()); @@ -3790,17 +3794,17 @@ TemplateDeclInstantiator::VisitOMPDeclareMapperDecl(OMPDeclareMapperDecl *D) { SemaRef.SubstDeclarationNameInfo(OldC->getMapperIdInfo(), TemplateArgs); OMPVarListLocTy Locs(OldC->getBeginLoc(), OldC->getLParenLoc(), OldC->getEndLoc()); - OMPClause *NewC = SemaRef.ActOnOpenMPMapClause( + OMPClause *NewC = SemaRef.OpenMP().ActOnOpenMPMapClause( OldC->getIteratorModifier(), OldC->getMapTypeModifiers(), OldC->getMapTypeModifiersLoc(), SS, NewNameInfo, OldC->getMapType(), OldC->isImplicitMapType(), OldC->getMapLoc(), OldC->getColonLoc(), NewVars, Locs); Clauses.push_back(NewC); } - SemaRef.EndOpenMPDSABlock(nullptr); + SemaRef.OpenMP().EndOpenMPDSABlock(nullptr); if (!IsCorrect) return nullptr; - Sema::DeclGroupPtrTy DG = SemaRef.ActOnOpenMPDeclareMapperDirective( + Sema::DeclGroupPtrTy DG = SemaRef.OpenMP().ActOnOpenMPDeclareMapperDirective( /*S=*/nullptr, Owner, D->getDeclName(), SubstMapperTy, D->getLocation(), VN, D->getAccess(), MapperVarRef.get(), Clauses, PrevDeclInScope); Decl *NewDMD = DG.get().getSingleDecl(); @@ -5384,7 +5388,7 @@ void Sema::BuildVariableInstantiation( *this,
NewVar->getDeclName(), NewVar->getLocation(), NewVar->isLocalExternDecl() ? Sema::LookupRedeclarationWithLinkage : Sema::LookupOrdinaryName, - NewVar->isLocalExternDecl() ? Sema::ForExternalRedeclaration + NewVar->isLocalExternDecl() ? RedeclarationKind::ForExternalRedeclaration : forRedeclarationInCurContext()); if (NewVar->isLocalExternDecl() && OldVar->getPreviousDecl() && diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 404c4e8e31b55..1b31df8d97fba 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -35,6 +35,7 @@ #include "clang/Sema/ScopeInfo.h" #include "clang/Sema/SemaCUDA.h" #include "clang/Sema/SemaInternal.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/Template.h" #include "clang/Sema/TemplateInstCallback.h" #include "llvm/ADT/ArrayRef.h" @@ -2640,7 +2641,7 @@ QualType Sema::BuildArrayType(QualType T, ArraySizeModifier ASM, } else if (isSFINAEContext()) { VLADiag = diag::err_vla_in_sfinae; VLAIsError = true; - } else if (getLangOpts().OpenMP && isInOpenMPTaskUntiedContext()) { + } else if (getLangOpts().OpenMP && OpenMP().isInOpenMPTaskUntiedContext()) { VLADiag = diag::err_openmp_vla_in_task_untied; VLAIsError = true; } else if (getLangOpts().CPlusPlus) { diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 8c96134af7c8f..eb05783a6219d 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -40,6 +40,7 @@ #include "clang/Sema/SemaDiagnostic.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaOpenACC.h" +#include "clang/Sema/SemaOpenMP.h" #include "clang/Sema/SemaSYCL.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/ErrorHandling.h" @@ -1656,7 +1657,7 @@ class TreeTransform { /// Ensures that the outermost loop in @p LoopStmt is wrapped by a /// OMPCanonicalLoop. StmtResult RebuildOMPCanonicalLoop(Stmt *LoopStmt) { - return getSema().ActOnOpenMPCanonicalLoop(LoopStmt); + return getSema().OpenMP().ActOnOpenMPCanonicalLoop(LoopStmt); } /// Build a new OpenMP executable directive. @@ -1669,7 +1670,7 @@ class TreeTransform { Stmt *AStmt, SourceLocation StartLoc, SourceLocation EndLoc, OpenMPDirectiveKind PrevMappedDirective = OMPD_unknown) { - return getSema().ActOnOpenMPExecutableDirective( + return getSema().OpenMP().ActOnOpenMPExecutableDirective( Kind, DirName, CancelRegion, Clauses, AStmt, StartLoc, EndLoc, PrevMappedDirective); } @@ -1684,9 +1685,9 @@ class TreeTransform { SourceLocation NameModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPIfClause(NameModifier, Condition, StartLoc, - LParenLoc, NameModifierLoc, ColonLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPIfClause( + NameModifier, Condition, StartLoc, LParenLoc, NameModifierLoc, ColonLoc, + EndLoc); } /// Build a new OpenMP 'final' clause. @@ -1696,8 +1697,8 @@ class TreeTransform { OMPClause *RebuildOMPFinalClause(Expr *Condition, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFinalClause(Condition, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFinalClause(Condition, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'num_threads' clause. 
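The LookupResult hunks in this patch all make the same substitution: the redeclaration kind moves from an unscoped enumerator nested inside Sema (Sema::ForExternalRedeclaration) to a standalone scoped enumeration (RedeclarationKind::ForExternalRedeclaration). A minimal sketch of that kind of migration, using hypothetical stand-in names rather than the real Clang declarations:

// Before: an unscoped enum nested in a large class. Users spell the
// value through the class name, and it converts silently to int.
struct BigClassOld {
  enum RedeclKind { ForVisible, ForExternal };
  void lookup(RedeclKind K) { (void)K; }
};

// After: a scoped enum at namespace level. Call sites switch from
// BigClassOld::ForExternal to RedeclarationKindLike::ForExternal, and
// accidental integer conversions stop compiling.
enum class RedeclarationKindLike { ForVisible, ForExternal };

struct BigClassNew {
  void lookup(RedeclarationKindLike K) { (void)K; }
};

void caller(BigClassNew &C, bool IsLocalExtern) {
  C.lookup(IsLocalExtern ? RedeclarationKindLike::ForExternal
                         : RedeclarationKindLike::ForVisible);
}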
@@ -1708,8 +1709,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNumThreadsClause(NumThreads, StartLoc, - LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPNumThreadsClause(NumThreads, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'safelen' clause. @@ -1719,7 +1720,8 @@ class TreeTransform { OMPClause *RebuildOMPSafelenClause(Expr *Len, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSafelenClause(Len, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPSafelenClause(Len, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'simdlen' clause. @@ -1729,28 +1731,30 @@ class TreeTransform { OMPClause *RebuildOMPSimdlenClause(Expr *Len, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSimdlenClause(Len, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPSimdlenClause(Len, StartLoc, LParenLoc, + EndLoc); } OMPClause *RebuildOMPSizesClause(ArrayRef Sizes, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSizesClause(Sizes, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPSizesClause(Sizes, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'full' clause. OMPClause *RebuildOMPFullClause(SourceLocation StartLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFullClause(StartLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPFullClause(StartLoc, EndLoc); } /// Build a new OpenMP 'partial' clause. OMPClause *RebuildOMPPartialClause(Expr *Factor, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPPartialClause(Factor, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPPartialClause(Factor, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'allocator' clause. @@ -1760,7 +1764,8 @@ class TreeTransform { OMPClause *RebuildOMPAllocatorClause(Expr *A, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAllocatorClause(A, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAllocatorClause(A, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'collapse' clause. @@ -1770,8 +1775,8 @@ class TreeTransform { OMPClause *RebuildOMPCollapseClause(Expr *Num, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPCollapseClause(Num, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPCollapseClause(Num, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'default' clause. @@ -1782,8 +1787,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDefaultClause(Kind, KindKwLoc, - StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDefaultClause( + Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'proc_bind' clause. @@ -1795,8 +1800,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPProcBindClause(Kind, KindKwLoc, - StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPProcBindClause( + Kind, KindKwLoc, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'schedule' clause. 
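Every Rebuild* hunk in TreeTransform.h changes call sites the same way: the OpenMP entry points now live on a SemaOpenMP part object reached through Sema::OpenMP(), and each wrapper forwards through that accessor instead of calling Sema directly. A reduced sketch of the shape, with placeholder members standing in for the much larger real classes:

#include <memory>

struct OpenMPSema {                // stand-in for clang::SemaOpenMP
  bool ActOnFinalClause(bool Condition) { return Condition; }
};

struct SemaLike {                  // stand-in for clang::Sema
  OpenMPSema &OpenMP() { return *OMP; }
  std::unique_ptr<OpenMPSema> OMP = std::make_unique<OpenMPSema>();
};

// A TreeTransform-style wrapper: before the split it would have called
// S.ActOnFinalClause(...) directly; now it goes through the accessor.
bool RebuildFinalClause(SemaLike &S, bool Condition) {
  return S.OpenMP().ActOnFinalClause(Condition);
}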
@@ -1808,7 +1813,7 @@ class TreeTransform { OpenMPScheduleClauseKind Kind, Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation M1Loc, SourceLocation M2Loc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPScheduleClause( + return getSema().OpenMP().ActOnOpenMPScheduleClause( M1, M2, Kind, ChunkSize, StartLoc, LParenLoc, M1Loc, M2Loc, KindLoc, CommaLoc, EndLoc); } @@ -1820,7 +1825,8 @@ class TreeTransform { OMPClause *RebuildOMPOrderedClause(SourceLocation StartLoc, SourceLocation EndLoc, SourceLocation LParenLoc, Expr *Num) { - return getSema().ActOnOpenMPOrderedClause(StartLoc, EndLoc, LParenLoc, Num); + return getSema().OpenMP().ActOnOpenMPOrderedClause(StartLoc, EndLoc, + LParenLoc, Num); } /// Build a new OpenMP 'private' clause. @@ -1831,8 +1837,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPPrivateClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPPrivateClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'firstprivate' clause. @@ -1843,8 +1849,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFirstprivateClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFirstprivateClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'lastprivate' clause. @@ -1858,7 +1864,7 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPLastprivateClause( + return getSema().OpenMP().ActOnOpenMPLastprivateClause( VarList, LPKind, LPKindLoc, ColonLoc, StartLoc, LParenLoc, EndLoc); } @@ -1870,8 +1876,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSharedClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPSharedClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'reduction' clause. 
@@ -1885,7 +1891,7 @@ class TreeTransform { SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { - return getSema().ActOnOpenMPReductionClause( + return getSema().OpenMP().ActOnOpenMPReductionClause( VarList, Modifier, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } @@ -1900,7 +1906,7 @@ class TreeTransform { CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { - return getSema().ActOnOpenMPTaskReductionClause( + return getSema().OpenMP().ActOnOpenMPTaskReductionClause( VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } @@ -1916,7 +1922,7 @@ class TreeTransform { CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { - return getSema().ActOnOpenMPInReductionClause( + return getSema().OpenMP().ActOnOpenMPInReductionClause( VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } @@ -1930,9 +1936,9 @@ class TreeTransform { SourceLocation LParenLoc, OpenMPLinearClauseKind Modifier, SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation StepModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPLinearClause(VarList, Step, StartLoc, LParenLoc, - Modifier, ModifierLoc, ColonLoc, - StepModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPLinearClause( + VarList, Step, StartLoc, LParenLoc, Modifier, ModifierLoc, ColonLoc, + StepModifierLoc, EndLoc); } /// Build a new OpenMP 'aligned' clause. @@ -1944,8 +1950,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAlignedClause(VarList, Alignment, StartLoc, - LParenLoc, ColonLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAlignedClause( + VarList, Alignment, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'copyin' clause. @@ -1956,8 +1962,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPCopyinClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPCopyinClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'copyprivate' clause. @@ -1968,8 +1974,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPCopyprivateClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPCopyprivateClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'flush' pseudo clause. @@ -1980,8 +1986,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFlushClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFlushClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'depobj' pseudo clause. @@ -1991,8 +1997,8 @@ class TreeTransform { OMPClause *RebuildOMPDepobjClause(Expr *Depobj, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDepobjClause(Depobj, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPDepobjClause(Depobj, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'depend' pseudo clause. 
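These builders are reached through getDerived() and getSema(), TreeTransform's CRTP arrangement: the base template statically casts itself to the derived transform, so a subclass can shadow any Rebuild* hook without virtual dispatch. A minimal sketch of the mechanism; nothing here beyond the cast itself reflects the real class:

template <typename Derived>
struct TransformBase {
  Derived &getDerived() { return static_cast<Derived &>(*this); }

  // Default hook; a derived transform may shadow it.
  int RebuildClause(int X) { return X; }

  // Always dispatches through the derived type, resolved at compile time.
  int Transform(int X) { return getDerived().RebuildClause(X); }
};

struct MyTransform : TransformBase<MyTransform> {
  int RebuildClause(int X) { return X + 1; } // replaces the default hook
};

// MyTransform{}.Transform(1) yields 2, while a transform that does not
// shadow RebuildClause gets the base behavior and yields 1.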
@@ -2004,8 +2010,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDependClause(Data, DepModifier, VarList, - StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDependClause( + Data, DepModifier, VarList, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'device' clause. @@ -2017,8 +2023,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDeviceClause(Modifier, Device, StartLoc, - LParenLoc, ModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDeviceClause( + Modifier, Device, StartLoc, LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'map' clause. @@ -2032,7 +2038,7 @@ class TreeTransform { OpenMPMapClauseKind MapType, bool IsMapTypeImplicit, SourceLocation MapLoc, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPMapClause( + return getSema().OpenMP().ActOnOpenMPMapClause( IteratorModifier, MapTypeModifiers, MapTypeModifiersLoc, MapperIdScopeSpec, MapperId, MapType, IsMapTypeImplicit, MapLoc, ColonLoc, VarList, Locs, @@ -2048,8 +2054,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAllocateClause(Allocate, VarList, StartLoc, - LParenLoc, ColonLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAllocateClause( + Allocate, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc); } /// Build a new OpenMP 'num_teams' clause. @@ -2059,8 +2065,8 @@ class TreeTransform { OMPClause *RebuildOMPNumTeamsClause(Expr *NumTeams, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNumTeamsClause(NumTeams, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNumTeamsClause(NumTeams, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'thread_limit' clause. @@ -2071,8 +2077,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPThreadLimitClause(ThreadLimit, StartLoc, - LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPThreadLimitClause( + ThreadLimit, StartLoc, LParenLoc, EndLoc); } /// Build a new OpenMP 'priority' clause. @@ -2082,8 +2088,8 @@ class TreeTransform { OMPClause *RebuildOMPPriorityClause(Expr *Priority, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPPriorityClause(Priority, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPPriorityClause(Priority, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'grainsize' clause. @@ -2095,8 +2101,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPGrainsizeClause(Modifier, Device, StartLoc, - LParenLoc, ModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPGrainsizeClause( + Modifier, Device, StartLoc, LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'num_tasks' clause. 
@@ -2108,8 +2114,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation ModifierLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNumTasksClause(Modifier, NumTasks, StartLoc, - LParenLoc, ModifierLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPNumTasksClause( + Modifier, NumTasks, StartLoc, LParenLoc, ModifierLoc, EndLoc); } /// Build a new OpenMP 'hint' clause. @@ -2119,7 +2125,8 @@ class TreeTransform { OMPClause *RebuildOMPHintClause(Expr *Hint, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPHintClause(Hint, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPHintClause(Hint, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'detach' clause. @@ -2129,7 +2136,8 @@ class TreeTransform { OMPClause *RebuildOMPDetachClause(Expr *Evt, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDetachClause(Evt, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDetachClause(Evt, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'dist_schedule' clause. @@ -2141,7 +2149,7 @@ class TreeTransform { Expr *ChunkSize, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation KindLoc, SourceLocation CommaLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDistScheduleClause( + return getSema().OpenMP().ActOnOpenMPDistScheduleClause( Kind, ChunkSize, StartLoc, LParenLoc, KindLoc, CommaLoc, EndLoc); } @@ -2156,9 +2164,9 @@ class TreeTransform { DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPToClause(MotionModifiers, MotionModifiersLoc, - MapperIdScopeSpec, MapperId, ColonLoc, - VarList, Locs, UnresolvedMappers); + return getSema().OpenMP().ActOnOpenMPToClause( + MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, + ColonLoc, VarList, Locs, UnresolvedMappers); } /// Build a new OpenMP 'from' clause. @@ -2172,7 +2180,7 @@ class TreeTransform { DeclarationNameInfo &MapperId, SourceLocation ColonLoc, ArrayRef VarList, const OMPVarListLocTy &Locs, ArrayRef UnresolvedMappers) { - return getSema().ActOnOpenMPFromClause( + return getSema().OpenMP().ActOnOpenMPFromClause( MotionModifiers, MotionModifiersLoc, MapperIdScopeSpec, MapperId, ColonLoc, VarList, Locs, UnresolvedMappers); } @@ -2183,7 +2191,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPUseDevicePtrClause(ArrayRef VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPUseDevicePtrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPUseDevicePtrClause(VarList, Locs); } /// Build a new OpenMP 'use_device_addr' clause. @@ -2192,7 +2200,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPUseDeviceAddrClause(ArrayRef VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPUseDeviceAddrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPUseDeviceAddrClause(VarList, Locs); } /// Build a new OpenMP 'is_device_ptr' clause. @@ -2201,7 +2209,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. 
OMPClause *RebuildOMPIsDevicePtrClause(ArrayRef VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPIsDevicePtrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPIsDevicePtrClause(VarList, Locs); } /// Build a new OpenMP 'has_device_addr' clause. @@ -2210,7 +2218,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPHasDeviceAddrClause(ArrayRef VarList, const OMPVarListLocTy &Locs) { - return getSema().ActOnOpenMPHasDeviceAddrClause(VarList, Locs); + return getSema().OpenMP().ActOnOpenMPHasDeviceAddrClause(VarList, Locs); } /// Build a new OpenMP 'defaultmap' clause. @@ -2224,8 +2232,8 @@ class TreeTransform { SourceLocation MLoc, SourceLocation KindLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDefaultmapClause(M, Kind, StartLoc, LParenLoc, - MLoc, KindLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDefaultmapClause( + M, Kind, StartLoc, LParenLoc, MLoc, KindLoc, EndLoc); } /// Build a new OpenMP 'nontemporal' clause. @@ -2236,8 +2244,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNontemporalClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'inclusive' clause. @@ -2248,8 +2256,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPInclusiveClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPInclusiveClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'exclusive' clause. @@ -2260,8 +2268,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPExclusiveClause(VarList, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPExclusiveClause(VarList, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'uses_allocators' clause. @@ -2269,10 +2277,10 @@ class TreeTransform { /// By default, performs semantic analysis to build the new OpenMP clause. /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPUsesAllocatorsClause( - ArrayRef Data, SourceLocation StartLoc, + ArrayRef Data, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPUsesAllocatorClause(StartLoc, LParenLoc, EndLoc, - Data); + return getSema().OpenMP().ActOnOpenMPUsesAllocatorClause( + StartLoc, LParenLoc, EndLoc, Data); } /// Build a new OpenMP 'affinity' clause. @@ -2284,8 +2292,8 @@ class TreeTransform { SourceLocation ColonLoc, SourceLocation EndLoc, Expr *Modifier, ArrayRef Locators) { - return getSema().ActOnOpenMPAffinityClause(StartLoc, LParenLoc, ColonLoc, - EndLoc, Modifier, Locators); + return getSema().OpenMP().ActOnOpenMPAffinityClause( + StartLoc, LParenLoc, ColonLoc, EndLoc, Modifier, Locators); } /// Build a new OpenMP 'order' clause. 
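The VarList parameters these rebuilders take are llvm::ArrayRef values (ArrayRef<Expr *> in the real signatures): non-owning views over caller-owned storage. For a dependency-free illustration, the sketch below uses std::span, which plays the same role here; ArrayRef itself is assumed, not shown:

#include <cstddef>
#include <span>
#include <vector>

// The callee only reads the list; the caller keeps ownership, and any
// contiguous container can be passed without copying.
std::size_t countNonNull(std::span<int *const> VarList) {
  std::size_t N = 0;
  for (int *P : VarList)
    if (P)
      ++N;
  return N;
}

int main() {
  int A = 1, B = 2;
  std::vector<int *> Vars = {&A, nullptr, &B};
  return countNonNull(Vars) == 2 ? 0 : 1; // view over the caller's vector
}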
@@ -2296,8 +2304,8 @@ class TreeTransform { OpenMPOrderClauseKind Kind, SourceLocation KindKwLoc, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc, OpenMPOrderClauseModifier Modifier, SourceLocation ModifierKwLoc) { - return getSema().ActOnOpenMPOrderClause(Modifier, Kind, StartLoc, LParenLoc, - ModifierKwLoc, KindKwLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPOrderClause( + Modifier, Kind, StartLoc, LParenLoc, ModifierKwLoc, KindKwLoc, EndLoc); } /// Build a new OpenMP 'init' clause. @@ -2309,8 +2317,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPInitClause(InteropVar, InteropInfo, StartLoc, - LParenLoc, VarLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPInitClause( + InteropVar, InteropInfo, StartLoc, LParenLoc, VarLoc, EndLoc); } /// Build a new OpenMP 'use' clause. @@ -2320,8 +2328,8 @@ class TreeTransform { OMPClause *RebuildOMPUseClause(Expr *InteropVar, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPUseClause(InteropVar, StartLoc, LParenLoc, - VarLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPUseClause(InteropVar, StartLoc, + LParenLoc, VarLoc, EndLoc); } /// Build a new OpenMP 'destroy' clause. @@ -2332,8 +2340,8 @@ class TreeTransform { SourceLocation LParenLoc, SourceLocation VarLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDestroyClause(InteropVar, StartLoc, LParenLoc, - VarLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPDestroyClause( + InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc); } /// Build a new OpenMP 'novariants' clause. @@ -2344,8 +2352,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNovariantsClause(Condition, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNovariantsClause(Condition, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'nocontext' clause. @@ -2355,8 +2363,8 @@ class TreeTransform { OMPClause *RebuildOMPNocontextClause(Expr *Condition, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPNocontextClause(Condition, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPNocontextClause(Condition, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'filter' clause. @@ -2366,8 +2374,8 @@ class TreeTransform { OMPClause *RebuildOMPFilterClause(Expr *ThreadID, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPFilterClause(ThreadID, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPFilterClause(ThreadID, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'bind' clause. @@ -2379,8 +2387,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPBindClause(Kind, KindLoc, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPBindClause(Kind, KindLoc, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'ompx_dyn_cgroup_mem' clause. 
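The Condition and InteropVar expressions handed to the rebuilders above come out of earlier transforms as ExprResult values, the pointer-plus-invalid-bit result type behind the isUsable() and isInvalid() checks seen elsewhere in this patch. A reduced sketch of that idiom, not the real ActionResult template:

template <typename T>
class ResultLike {
  T *Val = nullptr;
  bool Invalid = false;

public:
  ResultLike(T *V) : Val(V) {}               // success, possibly null
  static ResultLike error() {                // an error was diagnosed
    ResultLike R(nullptr);
    R.Invalid = true;
    return R;
  }
  bool isInvalid() const { return Invalid; }
  bool isUsable() const { return !Invalid && Val != nullptr; }
  T *get() const { return Val; }
};

// Typical propagation: bail out on error, skip on not-usable.
int useResult(ResultLike<int> R) {
  if (R.isInvalid())
    return -1;                               // mirrors `return StmtError();`
  return R.isUsable() ? *R.get() : 0;
}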
@@ -2390,8 +2398,8 @@ class TreeTransform { OMPClause *RebuildOMPXDynCGroupMemClause(Expr *Size, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPXDynCGroupMemClause(Size, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPXDynCGroupMemClause(Size, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'ompx_attribute' clause. @@ -2402,8 +2410,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPXAttributeClause(Attrs, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPXAttributeClause(Attrs, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'ompx_bare' clause. @@ -2412,7 +2420,7 @@ class TreeTransform { /// Subclasses may override this routine to provide different behavior. OMPClause *RebuildOMPXBareClause(SourceLocation StartLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPXBareClause(StartLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPXBareClause(StartLoc, EndLoc); } /// Build a new OpenMP 'align' clause. @@ -2422,7 +2430,8 @@ class TreeTransform { OMPClause *RebuildOMPAlignClause(Expr *A, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAlignClause(A, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPAlignClause(A, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'at' clause. @@ -2433,8 +2442,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPAtClause(Kind, KwLoc, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPAtClause(Kind, KwLoc, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'severity' clause. @@ -2446,8 +2455,8 @@ class TreeTransform { SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPSeverityClause(Kind, KwLoc, StartLoc, LParenLoc, - EndLoc); + return getSema().OpenMP().ActOnOpenMPSeverityClause(Kind, KwLoc, StartLoc, + LParenLoc, EndLoc); } /// Build a new OpenMP 'message' clause. @@ -2457,7 +2466,8 @@ class TreeTransform { OMPClause *RebuildOMPMessageClause(Expr *MS, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPMessageClause(MS, StartLoc, LParenLoc, EndLoc); + return getSema().OpenMP().ActOnOpenMPMessageClause(MS, StartLoc, LParenLoc, + EndLoc); } /// Build a new OpenMP 'doacross' clause. @@ -2469,7 +2479,7 @@ class TreeTransform { SourceLocation DepLoc, SourceLocation ColonLoc, ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, SourceLocation EndLoc) { - return getSema().ActOnOpenMPDoacrossClause( + return getSema().OpenMP().ActOnOpenMPDoacrossClause( DepType, DepLoc, ColonLoc, VarList, StartLoc, LParenLoc, EndLoc); } @@ -2777,9 +2787,9 @@ class TreeTransform { SourceLocation ColonLocSecond, Expr *Length, Expr *Stride, SourceLocation RBracketLoc) { - return getSema().ActOnOMPArraySectionExpr(Base, LBracketLoc, LowerBound, - ColonLocFirst, ColonLocSecond, - Length, Stride, RBracketLoc); + return getSema().OpenMP().ActOnOMPArraySectionExpr( + Base, LBracketLoc, LowerBound, ColonLocFirst, ColonLocSecond, Length, + Stride, RBracketLoc); } /// Build a new array shaping expression. 
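The TransformOMP*Directive bodies that follow all bracket one call to TransformOMPExecutableDirective between StartOpenMPDSABlock and EndOpenMPDSABlock. The patch keeps that pairing explicit; a generic scope guard is one way such Start/End pairs can be made early-return-safe, sketched here with hypothetical names rather than anything this patch adds:

#include <utility>

// Runs Begin immediately and End when the scope exits, however it exits.
template <typename BeginFn, typename EndFn>
class StartEndGuard {
  EndFn OnExit;

public:
  StartEndGuard(BeginFn OnEntry, EndFn OnExitFn)
      : OnExit(std::move(OnExitFn)) {
    OnEntry();
  }
  ~StartEndGuard() { OnExit(); }
  StartEndGuard(const StartEndGuard &) = delete;
  StartEndGuard &operator=(const StartEndGuard &) = delete;
};

// Usage sketch (hypothetical): guard a paired Start/End around a body.
//   StartEndGuard G([] { /* Start... */ }, [] { /* End... */ });
// Note that the real EndOpenMPDSABlock takes the transformed statement as
// an argument, which a simple guard cannot supply; that is presumably why
// the pairing stays explicit in TreeTransform.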
@@ -2790,19 +2800,20 @@ class TreeTransform { SourceLocation RParenLoc, ArrayRef Dims, ArrayRef BracketsRanges) { - return getSema().ActOnOMPArrayShapingExpr(Base, LParenLoc, RParenLoc, Dims, - BracketsRanges); + return getSema().OpenMP().ActOnOMPArrayShapingExpr( + Base, LParenLoc, RParenLoc, Dims, BracketsRanges); } /// Build a new iterator expression. /// /// By default, performs semantic analysis to build the new expression. /// Subclasses may override this routine to provide different behavior. - ExprResult RebuildOMPIteratorExpr( - SourceLocation IteratorKwLoc, SourceLocation LLoc, SourceLocation RLoc, - ArrayRef Data) { - return getSema().ActOnOMPIteratorExpr(/*Scope=*/nullptr, IteratorKwLoc, - LLoc, RLoc, Data); + ExprResult + RebuildOMPIteratorExpr(SourceLocation IteratorKwLoc, SourceLocation LLoc, + SourceLocation RLoc, + ArrayRef Data) { + return getSema().OpenMP().ActOnOMPIteratorExpr( + /*Scope=*/nullptr, IteratorKwLoc, LLoc, RLoc, Data); } /// Build a new call expression. @@ -8060,7 +8071,7 @@ template StmtResult TreeTransform::TransformForStmt(ForStmt *S) { if (getSema().getLangOpts().OpenMP) - getSema().startOpenMPLoop(); + getSema().OpenMP().startOpenMPLoop(); // Transform the initialization statement StmtResult Init = getDerived().TransformStmt(S->getInit()); @@ -8070,7 +8081,8 @@ TreeTransform::TransformForStmt(ForStmt *S) { // In OpenMP loop region loop control variable must be captured and be // private. Perform analysis of first part (if any). if (getSema().getLangOpts().OpenMP && Init.isUsable()) - getSema().ActOnOpenMPLoopInitialization(S->getForLoc(), Init.get()); + getSema().OpenMP().ActOnOpenMPLoopInitialization(S->getForLoc(), + Init.get()); // Transform the condition Sema::ConditionResult Cond = getDerived().TransformCondition( @@ -9029,9 +9041,9 @@ StmtResult TreeTransform::TransformOMPExecutableDirective( for (ArrayRef::iterator I = Clauses.begin(), E = Clauses.end(); I != E; ++I) { if (*I) { - getDerived().getSema().StartOpenMPClause((*I)->getClauseKind()); + getDerived().getSema().OpenMP().StartOpenMPClause((*I)->getClauseKind()); OMPClause *Clause = getDerived().TransformOMPClause(*I); - getDerived().getSema().EndOpenMPClause(); + getDerived().getSema().OpenMP().EndOpenMPClause(); if (Clause) TClauses.push_back(Clause); } else { @@ -9040,8 +9052,9 @@ StmtResult TreeTransform::TransformOMPExecutableDirective( } StmtResult AssociatedStmt; if (D->hasAssociatedStmt() && D->getAssociatedStmt()) { - getDerived().getSema().ActOnOpenMPRegionStart(D->getDirectiveKind(), - /*CurScope=*/nullptr); + getDerived().getSema().OpenMP().ActOnOpenMPRegionStart( + D->getDirectiveKind(), + /*CurScope=*/nullptr); StmtResult Body; { Sema::CompoundScopeRAII CompoundScope(getSema()); @@ -9059,7 +9072,7 @@ StmtResult TreeTransform::TransformOMPExecutableDirective( Body = getDerived().RebuildOMPCanonicalLoop(Body.get()); } AssociatedStmt = - getDerived().getSema().ActOnOpenMPRegionEnd(Body, TClauses); + getDerived().getSema().OpenMP().ActOnOpenMPRegionEnd(Body, TClauses); if (AssociatedStmt.isInvalid()) { return StmtError(); } @@ -9100,10 +9113,10 @@ template StmtResult TreeTransform::TransformOMPParallelDirective(OMPParallelDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - 
getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9111,10 +9124,10 @@ template StmtResult TreeTransform::TransformOMPSimdDirective(OMPSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_simd, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9122,10 +9135,10 @@ template StmtResult TreeTransform::TransformOMPTileDirective(OMPTileDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(D->getDirectiveKind(), DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9133,10 +9146,10 @@ template StmtResult TreeTransform::TransformOMPUnrollDirective(OMPUnrollDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(D->getDirectiveKind(), DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + D->getDirectiveKind(), DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9144,10 +9157,10 @@ template StmtResult TreeTransform::TransformOMPForDirective(OMPForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_for, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9155,10 +9168,10 @@ template StmtResult TreeTransform::TransformOMPForSimdDirective(OMPForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_for_simd, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9166,10 +9179,10 @@ template StmtResult TreeTransform::TransformOMPSectionsDirective(OMPSectionsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_sections, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_sections, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9177,10 +9190,10 @@ template StmtResult TreeTransform::TransformOMPSectionDirective(OMPSectionDirective *D) { 
DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_section, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_section, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9188,10 +9201,10 @@ template StmtResult TreeTransform::TransformOMPScopeDirective(OMPScopeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_scope, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_scope, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9199,10 +9212,10 @@ template StmtResult TreeTransform::TransformOMPSingleDirective(OMPSingleDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_single, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_single, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9210,20 +9223,20 @@ template StmtResult TreeTransform::TransformOMPMasterDirective(OMPMasterDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_master, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_master, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } template StmtResult TreeTransform::TransformOMPCriticalDirective(OMPCriticalDirective *D) { - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_critical, D->getDirectiveName(), nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9231,10 +9244,10 @@ template StmtResult TreeTransform::TransformOMPParallelForDirective( OMPParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_for, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9242,10 +9255,10 @@ template StmtResult TreeTransform::TransformOMPParallelForSimdDirective( OMPParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_for_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - 
getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9253,10 +9266,10 @@ template StmtResult TreeTransform::TransformOMPParallelMasterDirective( OMPParallelMasterDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_master, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_master, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9264,10 +9277,10 @@ template StmtResult TreeTransform::TransformOMPParallelMaskedDirective( OMPParallelMaskedDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_masked, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_masked, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9275,10 +9288,10 @@ template StmtResult TreeTransform::TransformOMPParallelSectionsDirective( OMPParallelSectionsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_sections, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_sections, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9286,10 +9299,10 @@ template StmtResult TreeTransform::TransformOMPTaskDirective(OMPTaskDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_task, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_task, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9297,10 +9310,10 @@ template StmtResult TreeTransform::TransformOMPTaskyieldDirective( OMPTaskyieldDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskyield, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskyield, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9308,10 +9321,10 @@ template StmtResult TreeTransform::TransformOMPBarrierDirective(OMPBarrierDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_barrier, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_barrier, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9319,10 +9332,10 @@ template 
StmtResult TreeTransform::TransformOMPTaskwaitDirective(OMPTaskwaitDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskwait, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskwait, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9330,10 +9343,10 @@ template StmtResult TreeTransform::TransformOMPErrorDirective(OMPErrorDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_error, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_error, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9341,10 +9354,10 @@ template StmtResult TreeTransform::TransformOMPTaskgroupDirective( OMPTaskgroupDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskgroup, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskgroup, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9352,10 +9365,10 @@ template StmtResult TreeTransform::TransformOMPFlushDirective(OMPFlushDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_flush, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_flush, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9363,10 +9376,10 @@ template StmtResult TreeTransform::TransformOMPDepobjDirective(OMPDepobjDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_depobj, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_depobj, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9374,10 +9387,10 @@ template StmtResult TreeTransform::TransformOMPScanDirective(OMPScanDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_scan, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_scan, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9385,10 +9398,10 @@ template StmtResult TreeTransform::TransformOMPOrderedDirective(OMPOrderedDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_ordered, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_ordered, 
DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9396,10 +9409,10 @@ template StmtResult TreeTransform::TransformOMPAtomicDirective(OMPAtomicDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_atomic, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_atomic, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9407,10 +9420,10 @@ template StmtResult TreeTransform::TransformOMPTargetDirective(OMPTargetDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9418,10 +9431,10 @@ template StmtResult TreeTransform::TransformOMPTargetDataDirective( OMPTargetDataDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_data, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_data, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9429,10 +9442,10 @@ template StmtResult TreeTransform::TransformOMPTargetEnterDataDirective( OMPTargetEnterDataDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_enter_data, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_enter_data, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9440,10 +9453,10 @@ template StmtResult TreeTransform::TransformOMPTargetExitDataDirective( OMPTargetExitDataDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_exit_data, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_exit_data, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9451,10 +9464,10 @@ template StmtResult TreeTransform::TransformOMPTargetParallelDirective( OMPTargetParallelDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_parallel, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_parallel, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); 
+ getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9462,10 +9475,10 @@ template StmtResult TreeTransform::TransformOMPTargetParallelForDirective( OMPTargetParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_parallel_for, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9473,10 +9486,10 @@ template StmtResult TreeTransform::TransformOMPTargetUpdateDirective( OMPTargetUpdateDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_update, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_update, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9484,10 +9497,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDirective(OMPTeamsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_teams, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_teams, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9495,10 +9508,10 @@ template StmtResult TreeTransform::TransformOMPCancellationPointDirective( OMPCancellationPointDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_cancellation_point, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_cancellation_point, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9506,10 +9519,10 @@ template StmtResult TreeTransform::TransformOMPCancelDirective(OMPCancelDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_cancel, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_cancel, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9517,10 +9530,10 @@ template StmtResult TreeTransform::TransformOMPTaskLoopDirective(OMPTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskloop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9528,10 +9541,10 @@ template StmtResult 
TreeTransform::TransformOMPTaskLoopSimdDirective( OMPTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_taskloop_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9539,10 +9552,10 @@ template StmtResult TreeTransform::TransformOMPMasterTaskLoopDirective( OMPMasterTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_master_taskloop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_master_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9550,10 +9563,10 @@ template StmtResult TreeTransform::TransformOMPMaskedTaskLoopDirective( OMPMaskedTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_masked_taskloop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_masked_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9561,10 +9574,10 @@ template StmtResult TreeTransform::TransformOMPMasterTaskLoopSimdDirective( OMPMasterTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_master_taskloop_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_master_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9572,10 +9585,10 @@ template StmtResult TreeTransform::TransformOMPMaskedTaskLoopSimdDirective( OMPMaskedTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_masked_taskloop_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_masked_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9583,10 +9596,10 @@ template StmtResult TreeTransform::TransformOMPParallelMasterTaskLoopDirective( OMPParallelMasterTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_master_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9594,10 +9607,10 @@ template StmtResult TreeTransform::TransformOMPParallelMaskedTaskLoopDirective( 
OMPParallelMaskedTaskLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_masked_taskloop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9606,10 +9619,10 @@ StmtResult TreeTransform::TransformOMPParallelMasterTaskLoopSimdDirective( OMPParallelMasterTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_master_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9618,10 +9631,10 @@ StmtResult TreeTransform::TransformOMPParallelMaskedTaskLoopSimdDirective( OMPParallelMaskedTaskLoopSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_parallel_masked_taskloop_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9629,10 +9642,10 @@ template StmtResult TreeTransform::TransformOMPDistributeDirective( OMPDistributeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_distribute, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_distribute, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9640,10 +9653,10 @@ template StmtResult TreeTransform::TransformOMPDistributeParallelForDirective( OMPDistributeParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_distribute_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9652,10 +9665,10 @@ StmtResult TreeTransform::TransformOMPDistributeParallelForSimdDirective( OMPDistributeParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_distribute_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9663,10 +9676,10 @@ template StmtResult TreeTransform::TransformOMPDistributeSimdDirective( OMPDistributeSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_distribute_simd, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_distribute_simd, DirName, nullptr, D->getBeginLoc()); 
StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9674,10 +9687,10 @@ template StmtResult TreeTransform::TransformOMPTargetParallelForSimdDirective( OMPTargetParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9685,10 +9698,10 @@ template StmtResult TreeTransform::TransformOMPTargetSimdDirective( OMPTargetSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_simd, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9696,10 +9709,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeDirective( OMPTeamsDistributeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_teams_distribute, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_teams_distribute, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9707,10 +9720,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeSimdDirective( OMPTeamsDistributeSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_teams_distribute_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9718,11 +9731,11 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeParallelForSimdDirective( OMPTeamsDistributeParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_teams_distribute_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9730,10 +9743,10 @@ template StmtResult TreeTransform::TransformOMPTeamsDistributeParallelForDirective( OMPTeamsDistributeParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_teams_distribute_parallel_for, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9741,10 
+9754,10 @@ template StmtResult TreeTransform::TransformOMPTargetTeamsDirective( OMPTargetTeamsDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_teams, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_teams, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9752,10 +9765,10 @@ template StmtResult TreeTransform::TransformOMPTargetTeamsDistributeDirective( OMPTargetTeamsDistributeDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9764,11 +9777,11 @@ StmtResult TreeTransform::TransformOMPTargetTeamsDistributeParallelForDirective( OMPTargetTeamsDistributeParallelForDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute_parallel_for, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9777,11 +9790,11 @@ StmtResult TreeTransform:: TransformOMPTargetTeamsDistributeParallelForSimdDirective( OMPTargetTeamsDistributeParallelForSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute_parallel_for_simd, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9790,10 +9803,10 @@ StmtResult TreeTransform::TransformOMPTargetTeamsDistributeSimdDirective( OMPTargetTeamsDistributeSimdDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock( + getDerived().getSema().OpenMP().StartOpenMPDSABlock( OMPD_target_teams_distribute_simd, DirName, nullptr, D->getBeginLoc()); auto Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9801,10 +9814,10 @@ template StmtResult TreeTransform::TransformOMPInteropDirective(OMPInteropDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_interop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_interop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9812,10 +9825,10 @@ template StmtResult TreeTransform::TransformOMPDispatchDirective(OMPDispatchDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_dispatch, DirName, nullptr, - D->getBeginLoc()); + 
getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_dispatch, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9823,10 +9836,10 @@ template StmtResult TreeTransform::TransformOMPMaskedDirective(OMPMaskedDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_masked, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_masked, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9834,10 +9847,10 @@ template StmtResult TreeTransform::TransformOMPGenericLoopDirective( OMPGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_loop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9845,10 +9858,10 @@ template StmtResult TreeTransform::TransformOMPTeamsGenericLoopDirective( OMPTeamsGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_teams_loop, DirName, nullptr, - D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_teams_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9856,10 +9869,10 @@ template StmtResult TreeTransform::TransformOMPTargetTeamsGenericLoopDirective( OMPTargetTeamsGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_teams_loop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_teams_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9867,10 +9880,10 @@ template StmtResult TreeTransform::TransformOMPParallelGenericLoopDirective( OMPParallelGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_parallel_loop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_parallel_loop, DirName, nullptr, D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -9879,10 +9892,10 @@ StmtResult TreeTransform::TransformOMPTargetParallelGenericLoopDirective( OMPTargetParallelGenericLoopDirective *D) { DeclarationNameInfo DirName; - getDerived().getSema().StartOpenMPDSABlock(OMPD_target_parallel_loop, DirName, - nullptr, D->getBeginLoc()); + getDerived().getSema().OpenMP().StartOpenMPDSABlock( + OMPD_target_parallel_loop, DirName, nullptr, 
D->getBeginLoc()); StmtResult Res = getDerived().TransformOMPExecutableDirective(D); - getDerived().getSema().EndOpenMPDSABlock(Res.get()); + getDerived().getSema().OpenMP().EndOpenMPDSABlock(Res.get()); return Res; } @@ -10972,7 +10985,7 @@ TreeTransform::TransformOMPExclusiveClause(OMPExclusiveClause *C) { template OMPClause *TreeTransform::TransformOMPUsesAllocatorsClause( OMPUsesAllocatorsClause *C) { - SmallVector Data; + SmallVector Data; Data.reserve(C->getNumberOfAllocators()); for (unsigned I = 0, E = C->getNumberOfAllocators(); I < E; ++I) { OMPUsesAllocatorsClause::Data D = C->getAllocatorData(I); @@ -10985,7 +10998,7 @@ OMPClause *TreeTransform::TransformOMPUsesAllocatorsClause( if (AllocatorTraits.isInvalid()) continue; } - Sema::UsesAllocatorsData &NewD = Data.emplace_back(); + SemaOpenMP::UsesAllocatorsData &NewD = Data.emplace_back(); NewD.Allocator = Allocator.get(); NewD.AllocatorTraits = AllocatorTraits.get(); NewD.LParenLoc = D.LParenLoc; @@ -11075,6 +11088,77 @@ OMPClause *TreeTransform::TransformOMPXBareClause(OMPXBareClause *C) { //===----------------------------------------------------------------------===// // OpenACC transformation //===----------------------------------------------------------------------===// +namespace { +template +class OpenACCClauseTransform final + : public OpenACCClauseVisitor> { + TreeTransform &Self; + SemaOpenACC::OpenACCParsedClause &ParsedClause; + OpenACCClause *NewClause = nullptr; + +public: + OpenACCClauseTransform(TreeTransform &Self, + SemaOpenACC::OpenACCParsedClause &PC) + : Self(Self), ParsedClause(PC) {} + + OpenACCClause *CreatedClause() const { return NewClause; } + +#define VISIT_CLAUSE(CLAUSE_NAME) \ + void Visit##CLAUSE_NAME##Clause(const OpenACC##CLAUSE_NAME##Clause &Clause); +#include "clang/Basic/OpenACCClauses.def" +}; + +template +void OpenACCClauseTransform::VisitDefaultClause( + const OpenACCDefaultClause &C) { + ParsedClause.setDefaultDetails(C.getDefaultClauseKind()); + + NewClause = OpenACCDefaultClause::Create( + Self.getSema().getASTContext(), ParsedClause.getDefaultClauseKind(), + ParsedClause.getBeginLoc(), ParsedClause.getLParenLoc(), + ParsedClause.getEndLoc()); +} + +template +void OpenACCClauseTransform::VisitIfClause(const OpenACCIfClause &C) { + Expr *Cond = const_cast(C.getConditionExpr()); + assert(Cond && "If constructed with invalid Condition"); + Sema::ConditionResult Res = Self.TransformCondition( + Cond->getExprLoc(), /*Var=*/nullptr, Cond, Sema::ConditionKind::Boolean); + + if (Res.isInvalid() || !Res.get().second) + return; + + ParsedClause.setConditionDetails(Res.get().second); + + NewClause = OpenACCIfClause::Create( + Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), + ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(), + ParsedClause.getEndLoc()); +} + +template +void OpenACCClauseTransform::VisitSelfClause( + const OpenACCSelfClause &C) { + + if (C.hasConditionExpr()) { + Expr *Cond = const_cast(C.getConditionExpr()); + Sema::ConditionResult Res = + Self.TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond, + Sema::ConditionKind::Boolean); + + if (Res.isInvalid() || !Res.get().second) + return; + + ParsedClause.setConditionDetails(Res.get().second); + } + + NewClause = OpenACCSelfClause::Create( + Self.getSema().getASTContext(), ParsedClause.getBeginLoc(), + ParsedClause.getLParenLoc(), ParsedClause.getConditionExpr(), + ParsedClause.getEndLoc()); +} +} // namespace template OpenACCClause *TreeTransform::TransformOpenACCClause( ArrayRef ExistingClauses, @@ 
-11087,33 +11171,10 @@ OpenACCClause *TreeTransform::TransformOpenACCClause( if (const auto *WithParms = dyn_cast(OldClause)) ParsedClause.setLParenLoc(WithParms->getLParenLoc()); - switch (OldClause->getClauseKind()) { - case OpenACCClauseKind::Default: - // There is nothing to do here as nothing dependent can appear in this - // clause. So just set the values so Sema can set the right value. - ParsedClause.setDefaultDetails( - cast(OldClause)->getDefaultClauseKind()); - break; - case OpenACCClauseKind::If: { - Expr *Cond = const_cast( - cast(OldClause)->getConditionExpr()); - assert(Cond && "If constructed with invalid Condition"); - Sema::ConditionResult Res = - TransformCondition(Cond->getExprLoc(), /*Var=*/nullptr, Cond, - Sema::ConditionKind::Boolean); - - if (Res.isInvalid() || !Res.get().second) - return nullptr; - - ParsedClause.setConditionDetails(Res.get().second); - break; - } - default: - assert(false && "Unhandled OpenACC clause in TreeTransform"); - return nullptr; - } + OpenACCClauseTransform Transform{*this, ParsedClause}; + Transform.Visit(OldClause); - return getSema().OpenACC().ActOnClause(ExistingClauses, ParsedClause); + return Transform.CreatedClause(); } template @@ -11667,7 +11728,7 @@ template ExprResult TreeTransform::TransformOMPIteratorExpr(OMPIteratorExpr *E) { unsigned NumIterators = E->numOfIterators(); - SmallVector Data(NumIterators); + SmallVector Data(NumIterators); bool ErrorFound = false; bool NeedToRebuild = getDerived().AlwaysRebuild(); @@ -11802,7 +11863,8 @@ TreeTransform::TransformMemberExpr(MemberExpr *E) { // Skip for member expression of (this->f), rebuilt thisi->f is needed // for Openmp where the field need to be privatizized in the case. if (!(isa(E->getBase()) && - getSema().isOpenMPRebuildMemberExpr(cast(Member)))) { + getSema().OpenMP().isOpenMPRebuildMemberExpr( + cast(Member)))) { // Mark it referenced in the new context regardless. // FIXME: this is a bit instantiation-specific. 
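The TreeTransform.h changes up to this point follow two patterns. First, the OpenMP-specific Sema entry points (StartOpenMPDSABlock/EndOpenMPDSABlock, isOpenMPRebuildMemberExpr) and their helper types (SemaOpenMP::UsesAllocatorsData, SemaOpenMP::OMPIteratorData) move out of Sema proper into a SemaOpenMP component reached through getSema().OpenMP(). Second, the hand-written switch over OpenACC clause kinds is replaced by an OpenACCClauseTransform visitor whose Visit##CLAUSE_NAME##Clause members are stamped out from the OpenACCClauses.def X-macro list, so a new clause kind needs only a .def entry plus one Visit method instead of another case in a monolithic switch. A minimal, self-contained sketch of that X-macro visitor shape (the clause list and types below are illustrative placeholders, not the real clang definitions):

#include <iostream>

// Illustrative clause list in the style of OpenACCClauses.def (hypothetical).
#define CLAUSE_LIST(V) V(Default) V(If) V(Self)

struct DefaultClause {};
struct IfClause {};
struct SelfClause {};

struct ClauseTransform {
  // One Visit<Name>Clause member is stamped out per list entry, mirroring
  // the VISIT_CLAUSE/#include pattern used in the patch.
#define VISIT_CLAUSE(NAME)                                                    \
  void Visit##NAME##Clause(const NAME##Clause &) {                            \
    std::cout << "transformed " #NAME " clause\n";                            \
  }
  CLAUSE_LIST(VISIT_CLAUSE)
#undef VISIT_CLAUSE
};

int main() {
  ClauseTransform T;
  T.VisitIfClause(IfClause{});
  T.VisitSelfClause(SelfClause{});
}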
SemaRef.MarkMemberReferenced(E); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8c4b460970ad2..b28df03b4a95e 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -31,7 +31,6 @@ #include "clang/AST/ExternalASTSource.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/ODRDiagsEmitter.h" -#include "clang/AST/ODRHash.h" #include "clang/AST/OpenACCClause.h" #include "clang/AST/OpenMPClause.h" #include "clang/AST/RawCommentList.h" @@ -915,10 +914,9 @@ ASTSelectorLookupTrait::ReadKey(const unsigned char* d, unsigned) { using namespace llvm::support; SelectorTable &SelTable = Reader.getContext().Selectors; - unsigned N = - endian::readNext(d); + unsigned N = endian::readNext(d); const IdentifierInfo *FirstII = Reader.getLocalIdentifier( - F, endian::readNext(d)); + F, endian::readNext(d)); if (N == 0) return SelTable.getNullarySelector(FirstII); else if (N == 1) @@ -928,7 +926,7 @@ ASTSelectorLookupTrait::ReadKey(const unsigned char* d, unsigned) { Args.push_back(FirstII); for (unsigned I = 1; I != N; ++I) Args.push_back(Reader.getLocalIdentifier( - F, endian::readNext(d))); + F, endian::readNext(d))); return SelTable.getSelector(N, Args.data()); } @@ -941,11 +939,11 @@ ASTSelectorLookupTrait::ReadData(Selector, const unsigned char* d, data_type Result; Result.ID = Reader.getGlobalSelectorID( - F, endian::readNext(d)); + F, endian::readNext(d)); unsigned FullInstanceBits = - endian::readNext(d); + endian::readNext(d); unsigned FullFactoryBits = - endian::readNext(d); + endian::readNext(d); Result.InstanceBits = FullInstanceBits & 0x3; Result.InstanceHasMoreThanOneDecl = (FullInstanceBits >> 2) & 0x1; Result.FactoryBits = FullFactoryBits & 0x3; @@ -956,16 +954,14 @@ ASTSelectorLookupTrait::ReadData(Selector, const unsigned char* d, // Load instance methods for (unsigned I = 0; I != NumInstanceMethods; ++I) { if (ObjCMethodDecl *Method = Reader.GetLocalDeclAs( - F, - endian::readNext(d))) + F, endian::readNext(d))) Result.Instance.push_back(Method); } // Load factory methods for (unsigned I = 0; I != NumFactoryMethods; ++I) { if (ObjCMethodDecl *Method = Reader.GetLocalDeclAs( - F, - endian::readNext(d))) + F, endian::readNext(d))) Result.Factory.push_back(Method); } @@ -1009,8 +1005,7 @@ static bool readBit(unsigned &Bits) { IdentID ASTIdentifierLookupTrait::ReadIdentifierID(const unsigned char *d) { using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); return Reader.getGlobalIdentifierID(F, RawID >> 1); } @@ -1028,8 +1023,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, unsigned DataLen) { using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); bool IsInteresting = RawID & 0x01; // Wipe out the "is interesting" bit. @@ -1053,9 +1047,8 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, } unsigned ObjCOrBuiltinID = - endian::readNext(d); - unsigned Bits = - endian::readNext(d); + endian::readNext(d); + unsigned Bits = endian::readNext(d); bool CPlusPlusOperatorKeyword = readBit(Bits); bool HasRevertedTokenIDToIdentifier = readBit(Bits); bool Poisoned = readBit(Bits); @@ -1084,7 +1077,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, // definition. 
if (HadMacroDefinition) { uint32_t MacroDirectivesOffset = - endian::readNext(d); + endian::readNext(d); DataLen -= 4; Reader.addPendingMacro(II, &F, MacroDirectivesOffset); @@ -1098,8 +1091,7 @@ IdentifierInfo *ASTIdentifierLookupTrait::ReadData(const internal_key_type& k, SmallVector DeclIDs; for (; DataLen > 0; DataLen -= 4) DeclIDs.push_back(Reader.getGlobalDeclID( - F, - endian::readNext(d))); + F, endian::readNext(d))); Reader.SetGloballyVisibleDecls(II, DeclIDs); } @@ -1169,7 +1161,7 @@ ASTDeclContextNameLookupTrait::ReadFileRef(const unsigned char *&d) { using namespace llvm::support; uint32_t ModuleFileID = - endian::readNext(d); + endian::readNext(d); return Reader.getLocalModuleFile(F, ModuleFileID); } @@ -1189,18 +1181,15 @@ ASTDeclContextNameLookupTrait::ReadKey(const unsigned char *d, unsigned) { case DeclarationName::CXXLiteralOperatorName: case DeclarationName::CXXDeductionGuideName: Data = (uint64_t)Reader.getLocalIdentifier( - F, endian::readNext(d)); + F, endian::readNext(d)); break; case DeclarationName::ObjCZeroArgSelector: case DeclarationName::ObjCOneArgSelector: case DeclarationName::ObjCMultiArgSelector: - Data = - (uint64_t)Reader - .getLocalSelector( - F, - endian::readNext( - d)) - .getAsOpaquePtr(); + Data = (uint64_t)Reader + .getLocalSelector( + F, endian::readNext(d)) + .getAsOpaquePtr(); break; case DeclarationName::CXXOperatorName: Data = *d++; // OverloadedOperatorKind @@ -1223,8 +1212,7 @@ void ASTDeclContextNameLookupTrait::ReadDataInto(internal_key_type, using namespace llvm::support; for (unsigned NumDecls = DataLen / 4; NumDecls; --NumDecls) { - uint32_t LocalID = - endian::readNext(d); + uint32_t LocalID = endian::readNext(d); Val.insert(Reader.getGlobalDeclID(F, LocalID)); } } @@ -2033,10 +2021,9 @@ HeaderFileInfoTrait::ReadKey(const unsigned char *d, unsigned) { using namespace llvm::support; internal_key_type ikey; - ikey.Size = - off_t(endian::readNext(d)); - ikey.ModTime = time_t( - endian::readNext(d)); + ikey.Size = off_t(endian::readNext(d)); + ikey.ModTime = + time_t(endian::readNext(d)); ikey.Filename = (const char *)d; ikey.Imported = true; return ikey; @@ -2064,9 +2051,9 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, HFI.DirInfo = (Flags >> 1) & 0x07; HFI.IndexHeaderMapHeader = Flags & 0x01; HFI.ControllingMacroID = Reader.getGlobalIdentifierID( - M, endian::readNext(d)); + M, endian::readNext(d)); if (unsigned FrameworkOffset = - endian::readNext(d)) { + endian::readNext(d)) { // The framework offset is 1 greater than the actual offset, // since 0 is used as an indicator for "no framework name". StringRef FrameworkName(FrameworkStrings + FrameworkOffset - 1); @@ -2077,7 +2064,7 @@ HeaderFileInfoTrait::ReadData(internal_key_ref key, const unsigned char *d, "Wrong data length in HeaderFileInfo deserialization"); while (d != End) { uint32_t LocalSMID = - endian::readNext(d); + endian::readNext(d); auto HeaderRole = static_cast(LocalSMID & 7); LocalSMID >>= 3; @@ -4085,9 +4072,8 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const { // how it goes... 
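The serialization hunks in ASTReader.cpp center on llvm::support::endian::readNext, which reads a fixed-width little-endian value out of an on-disk hash table or offset map and advances the byte cursor; the repeated one-line changes appear to drop a now-defaulted template argument at each call site (the argument lists themselves were lost in this rendering), leaving runtime behavior unchanged. As a rough, self-contained stand-in for what each of these calls does (readNextLE below is illustrative, not the llvm::support::endian implementation):

#include <cstdint>
#include <cstring>

// Illustrative stand-in for llvm::support::endian::readNext: copy a
// fixed-width integer out of an unaligned little-endian buffer and advance
// the cursor past it. (Byte-swapping for big-endian hosts is omitted.)
template <typename T>
T readNextLE(const unsigned char *&Cursor) {
  T Value;
  std::memcpy(&Value, Cursor, sizeof(T)); // memcpy tolerates unaligned input
  Cursor += sizeof(T);
  return Value;
}

int main() {
  const unsigned char Buf[] = {0x2A, 0x00, 0x01, 0x00, 0x00, 0x00};
  const unsigned char *d = Buf;
  uint16_t KeyLen = readNextLE<uint16_t>(d);  // 42, cursor advances 2 bytes
  uint32_t DataLen = readNextLE<uint32_t>(d); // 1, cursor advances 4 bytes
  return (KeyLen == 42 && DataLen == 1) ? 0 : 1;
}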
using namespace llvm::support; ModuleKind Kind = static_cast( - endian::readNext(Data)); - uint16_t Len = - endian::readNext(Data); + endian::readNext(Data)); + uint16_t Len = endian::readNext(Data); StringRef Name = StringRef((const char*)Data, Len); Data += Len; ModuleFile *OM = (Kind == MK_PrebuiltModule || Kind == MK_ExplicitModule || @@ -4103,21 +4089,21 @@ void ASTReader::ReadModuleOffsetMap(ModuleFile &F) const { } SourceLocation::UIntTy SLocOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t IdentifierIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t MacroIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t PreprocessedEntityIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t SubmoduleIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t SelectorIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t DeclIDOffset = - endian::readNext(Data); + endian::readNext(Data); uint32_t TypeIndexOffset = - endian::readNext(Data); + endian::readNext(Data); auto mapOffset = [&](uint32_t Offset, uint32_t BaseOffset, RemapBuilder &Remap) { @@ -9798,7 +9784,7 @@ void ASTReader::finishPendingActions() { !NonConstDefn->isLateTemplateParsed() && // We only perform ODR checks for decls not in the explicit // global module fragment. - !FD->shouldSkipCheckingODR() && + !shouldSkipCheckingODR(FD) && FD->getODRHash() != NonConstDefn->getODRHash()) { if (!isa(FD)) { PendingFunctionOdrMergeFailures[FD].push_back(NonConstDefn); @@ -11794,6 +11780,12 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { return OpenACCIfClause::Create(getContext(), BeginLoc, LParenLoc, CondExpr, EndLoc); } + case OpenACCClauseKind::Self: { + SourceLocation LParenLoc = readSourceLocation(); + Expr *CondExpr = readBool() ? readSubExpr() : nullptr; + return OpenACCSelfClause::Create(getContext(), BeginLoc, LParenLoc, + CondExpr, EndLoc); + } case OpenACCClauseKind::Finalize: case OpenACCClauseKind::IfPresent: case OpenACCClauseKind::Seq: @@ -11802,7 +11794,6 @@ OpenACCClause *ASTRecordReader::readOpenACCClause() { case OpenACCClauseKind::Worker: case OpenACCClauseKind::Vector: case OpenACCClauseKind::NoHost: - case OpenACCClauseKind::Self: case OpenACCClauseKind::Copy: case OpenACCClauseKind::UseDevice: case OpenACCClauseKind::Attach: diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index e4b6a75c118ba..74d40f7da34ca 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -826,7 +826,7 @@ void ASTDeclReader::VisitEnumDecl(EnumDecl *ED) { Reader.mergeDefinitionVisibility(OldDef, ED); // We don't want to check the ODR hash value for declarations from global // module fragment. - if (!ED->shouldSkipCheckingODR() && + if (!shouldSkipCheckingODR(ED) && OldDef->getODRHash() != ED->getODRHash()) Reader.PendingEnumOdrMergeFailures[OldDef].push_back(ED); } else { @@ -868,7 +868,7 @@ void ASTDeclReader::VisitRecordDecl(RecordDecl *RD) { VisitRecordDeclImpl(RD); // We should only reach here if we're in C/Objective-C. There is no // global module fragment. - assert(!RD->shouldSkipCheckingODR()); + assert(!shouldSkipCheckingODR(RD)); RD->setODRHash(Record.readInt()); // Maintain the invariant of a redeclaration chain containing only @@ -2155,7 +2155,7 @@ void ASTDeclReader::MergeDefinitionData( } // We don't want to check ODR for decls in the global module fragment. 
- if (MergeDD.Definition->shouldSkipCheckingODR()) + if (shouldSkipCheckingODR(MergeDD.Definition)) return; if (D->getODRHash() != MergeDD.ODRHash) { @@ -3530,7 +3530,7 @@ ASTDeclReader::FindExistingResult ASTDeclReader::findExisting(NamedDecl *D) { // same template specialization into the same CXXRecordDecl. auto MergedDCIt = Reader.MergedDeclContexts.find(D->getLexicalDeclContext()); if (MergedDCIt != Reader.MergedDeclContexts.end() && - !D->shouldSkipCheckingODR() && MergedDCIt->second == D->getDeclContext()) + !shouldSkipCheckingODR(D) && MergedDCIt->second == D->getDeclContext()) Reader.PendingOdrMergeChecks.push_back(D); return FindExistingResult(Reader, D, /*Existing=*/nullptr, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 85b7fd5535a1b..b2a078b6d80f4 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6188,7 +6188,7 @@ void ASTRecordWriter::AddCXXDefinitionData(const CXXRecordDecl *D) { BitsPacker DefinitionBits; - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); DefinitionBits.addBit(ShouldSkipCheckingODR); #define FIELD(Name, Width, Merge) \ @@ -7524,6 +7524,14 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { AddStmt(const_cast(IC->getConditionExpr())); return; } + case OpenACCClauseKind::Self: { + const auto *SC = cast(C); + writeSourceLocation(SC->getLParenLoc()); + writeBool(SC->hasConditionExpr()); + if (SC->hasConditionExpr()) + AddStmt(const_cast(SC->getConditionExpr())); + return; + } case OpenACCClauseKind::Finalize: case OpenACCClauseKind::IfPresent: case OpenACCClauseKind::Seq: @@ -7532,7 +7540,6 @@ void ASTRecordWriter::writeOpenACCClause(const OpenACCClause *C) { case OpenACCClauseKind::Worker: case OpenACCClauseKind::Vector: case OpenACCClauseKind::NoHost: - case OpenACCClauseKind::Self: case OpenACCClauseKind::Copy: case OpenACCClauseKind::UseDevice: case OpenACCClauseKind::Attach: diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 276b6257f1d84..c6db107e0ca42 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -16,7 +16,6 @@ #include "clang/AST/DeclTemplate.h" #include "clang/AST/DeclVisitor.h" #include "clang/AST/Expr.h" -#include "clang/AST/ODRHash.h" #include "clang/AST/OpenMPClause.h" #include "clang/AST/PrettyDeclStackTrace.h" #include "clang/Basic/SourceManager.h" @@ -526,7 +525,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { BitsPacker EnumDeclBits; EnumDeclBits.addBits(D->getNumPositiveBits(), /*BitWidth=*/8); EnumDeclBits.addBits(D->getNumNegativeBits(), /*BitWidth=*/8); - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); EnumDeclBits.addBit(ShouldSkipCheckingODR); EnumDeclBits.addBit(D->isScoped()); EnumDeclBits.addBit(D->isScopedUsingClassTag()); @@ -552,7 +551,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { !D->isTopLevelDeclInObjCContainer() && !CXXRecordDecl::classofKind(D->getKind()) && !D->getIntegerTypeSourceInfo() && !D->getMemberSpecializationInfo() && - !needsAnonymousDeclarationNumber(D) && !D->shouldSkipCheckingODR() && + !needsAnonymousDeclarationNumber(D) && !shouldSkipCheckingODR(D) && D->getDeclName().getNameKind() == DeclarationName::Identifier) AbbrevToUse = Writer.getDeclEnumAbbrev(); @@ -718,7 +717,7 @@ void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { // FIXME: 
stable encoding FunctionDeclBits.addBits(llvm::to_underlying(D->getLinkageInternal()), 3); FunctionDeclBits.addBits((uint32_t)D->getStorageClass(), /*BitWidth=*/3); - bool ShouldSkipCheckingODR = D->shouldSkipCheckingODR(); + bool ShouldSkipCheckingODR = shouldSkipCheckingODR(D); FunctionDeclBits.addBit(ShouldSkipCheckingODR); FunctionDeclBits.addBit(D->isInlineSpecified()); FunctionDeclBits.addBit(D->isInlined()); @@ -1559,7 +1558,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { D->getFirstDecl() == D->getMostRecentDecl() && !D->isInvalidDecl() && !D->hasAttrs() && !D->isTopLevelDeclInObjCContainer() && D->getDeclName().getNameKind() == DeclarationName::Identifier && - !D->shouldSkipCheckingODR() && !D->hasExtInfo() && + !shouldSkipCheckingODR(D) && !D->hasExtInfo() && !D->isExplicitlyDefaulted()) { if (D->getTemplatedKind() == FunctionDecl::TK_NonTemplate || D->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate || diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index e3816181e2b2b..a736a7b0ef726 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -19,7 +19,6 @@ #include "clang/AST/ExprOpenMP.h" #include "clang/AST/StmtVisitor.h" #include "clang/Lex/Token.h" -#include "clang/Sema/DeclSpec.h" #include "clang/Serialization/ASTRecordWriter.h" #include "llvm/Bitstream/BitstreamWriter.h" using namespace clang; diff --git a/clang/lib/Serialization/GeneratePCH.cpp b/clang/lib/Serialization/GeneratePCH.cpp index 2fece29f34487..bed74399098d7 100644 --- a/clang/lib/Serialization/GeneratePCH.cpp +++ b/clang/lib/Serialization/GeneratePCH.cpp @@ -17,7 +17,6 @@ #include "clang/Lex/HeaderSearchOptions.h" #include "clang/Lex/Preprocessor.h" #include "clang/Sema/SemaConsumer.h" -#include "clang/Serialization/ASTReader.h" #include "clang/Serialization/ASTWriter.h" #include "llvm/Bitstream/BitstreamWriter.h" diff --git a/clang/lib/Serialization/GlobalModuleIndex.cpp b/clang/lib/Serialization/GlobalModuleIndex.cpp index dd4fc3e009050..f09ceb8d31620 100644 --- a/clang/lib/Serialization/GlobalModuleIndex.cpp +++ b/clang/lib/Serialization/GlobalModuleIndex.cpp @@ -13,7 +13,6 @@ #include "clang/Serialization/GlobalModuleIndex.h" #include "ASTReaderInternals.h" #include "clang/Basic/FileManager.h" -#include "clang/Lex/HeaderSearch.h" #include "clang/Serialization/ASTBitCodes.h" #include "clang/Serialization/ModuleFile.h" #include "clang/Serialization/PCHContainerOperations.h" @@ -89,10 +88,8 @@ class IdentifierIndexReaderTrait { static std::pair ReadKeyDataLength(const unsigned char*& d) { using namespace llvm::support; - unsigned KeyLen = - endian::readNext(d); - unsigned DataLen = - endian::readNext(d); + unsigned KeyLen = endian::readNext(d); + unsigned DataLen = endian::readNext(d); return std::make_pair(KeyLen, DataLen); } @@ -113,8 +110,7 @@ class IdentifierIndexReaderTrait { data_type Result; while (DataLen > 0) { - unsigned ID = - endian::readNext(d); + unsigned ID = endian::readNext(d); Result.push_back(ID); DataLen -= 4; } @@ -514,8 +510,7 @@ namespace { // The first bit indicates whether this identifier is interesting. // That's all we care about. 
using namespace llvm::support; - unsigned RawID = - endian::readNext(d); + unsigned RawID = endian::readNext(d); bool IsInteresting = RawID & 0x01; return std::make_pair(k, IsInteresting); } diff --git a/clang/lib/Serialization/ModuleFileExtension.cpp b/clang/lib/Serialization/ModuleFileExtension.cpp index 95fff41e0d7a8..729529b5fca18 100644 --- a/clang/lib/Serialization/ModuleFileExtension.cpp +++ b/clang/lib/Serialization/ModuleFileExtension.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// #include "clang/Serialization/ModuleFileExtension.h" -#include "llvm/ADT/Hashing.h" + using namespace clang; char ModuleFileExtension::ID = 0; diff --git a/clang/lib/Serialization/MultiOnDiskHashTable.h b/clang/lib/Serialization/MultiOnDiskHashTable.h index 2402a628b512f..a0d75ec3a9e76 100644 --- a/clang/lib/Serialization/MultiOnDiskHashTable.h +++ b/clang/lib/Serialization/MultiOnDiskHashTable.h @@ -200,11 +200,11 @@ template class MultiOnDiskHashTable { storage_type Ptr = Data; uint32_t BucketOffset = - endian::readNext(Ptr); + endian::readNext(Ptr); // Read the list of overridden files. uint32_t NumFiles = - endian::readNext(Ptr); + endian::readNext(Ptr); // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make // an additional copy. llvm::SmallVector OverriddenFiles; diff --git a/clang/lib/Serialization/PCHContainerOperations.cpp b/clang/lib/Serialization/PCHContainerOperations.cpp index 56ca3394385b4..4aedb7debcff2 100644 --- a/clang/lib/Serialization/PCHContainerOperations.cpp +++ b/clang/lib/Serialization/PCHContainerOperations.cpp @@ -12,8 +12,6 @@ #include "clang/Serialization/PCHContainerOperations.h" #include "clang/AST/ASTConsumer.h" -#include "clang/Lex/ModuleLoader.h" -#include "llvm/Bitstream/BitstreamReader.h" #include "llvm/Support/raw_ostream.h" #include diff --git a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp index 01e46fa8591c0..1a75d7b52ad6e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/BuiltinFunctionChecker.cpp @@ -6,7 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This checker evaluates clang builtin functions. +// This checker evaluates "standalone" clang builtin functions that are not +// just special-cased variants of well-known non-builtin functions. +// Builtin functions like __builtin_memcpy and __builtin_alloca should be +// evaluated by the same checker that handles their non-builtin variant to +// ensure that the two variants are handled consistently. // //===----------------------------------------------------------------------===// @@ -80,25 +84,6 @@ bool BuiltinFunctionChecker::evalCall(const CallEvent &Call, return true; } - case Builtin::BI__builtin_alloca_with_align: - case Builtin::BI__builtin_alloca: { - SValBuilder &SVB = C.getSValBuilder(); - const loc::MemRegionVal R = - SVB.getAllocaRegionVal(CE, C.getLocationContext(), C.blockCount()); - - // Set the extent of the region in bytes. This enables us to use the SVal - // of the argument directly. If we saved the extent in bits, it'd be more - // difficult to reason about values like symbol*8. - auto Size = Call.getArgSVal(0); - if (auto DefSize = Size.getAs()) { - // This `getAs()` is mostly paranoia, because core.CallAndMessage reports - // undefined function arguments (unless it's disabled somehow). 
- state = setDynamicExtent(state, R.getRegion(), *DefSize, SVB); - } - C.addTransition(state->BindExpr(CE, LCtx, R)); - return true; - } - case Builtin::BI__builtin_dynamic_object_size: case Builtin::BI__builtin_object_size: case Builtin::BI__builtin_constant_p: { diff --git a/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp index 009c0d3fb9368..55ed809bfed6c 100644 --- a/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/ContainerModeling.cpp @@ -72,26 +72,31 @@ class ContainerModeling SVal) const; CallDescriptionMap NoIterParamFunctions = { - {{{"clear"}, 0}, &ContainerModeling::handleClear}, - {{{"assign"}, 2}, &ContainerModeling::handleAssign}, - {{{"push_back"}, 1}, &ContainerModeling::handlePushBack}, - {{{"emplace_back"}, 1}, &ContainerModeling::handlePushBack}, - {{{"pop_back"}, 0}, &ContainerModeling::handlePopBack}, - {{{"push_front"}, 1}, &ContainerModeling::handlePushFront}, - {{{"emplace_front"}, 1}, &ContainerModeling::handlePushFront}, - {{{"pop_front"}, 0}, &ContainerModeling::handlePopFront}, + {{CDM::CXXMethod, {"clear"}, 0}, &ContainerModeling::handleClear}, + {{CDM::CXXMethod, {"assign"}, 2}, &ContainerModeling::handleAssign}, + {{CDM::CXXMethod, {"push_back"}, 1}, &ContainerModeling::handlePushBack}, + {{CDM::CXXMethod, {"emplace_back"}, 1}, + &ContainerModeling::handlePushBack}, + {{CDM::CXXMethod, {"pop_back"}, 0}, &ContainerModeling::handlePopBack}, + {{CDM::CXXMethod, {"push_front"}, 1}, + &ContainerModeling::handlePushFront}, + {{CDM::CXXMethod, {"emplace_front"}, 1}, + &ContainerModeling::handlePushFront}, + {{CDM::CXXMethod, {"pop_front"}, 0}, &ContainerModeling::handlePopFront}, }; CallDescriptionMap OneIterParamFunctions = { - {{{"insert"}, 2}, &ContainerModeling::handleInsert}, - {{{"emplace"}, 2}, &ContainerModeling::handleInsert}, - {{{"erase"}, 1}, &ContainerModeling::handleErase}, - {{{"erase_after"}, 1}, &ContainerModeling::handleEraseAfter}, + {{CDM::CXXMethod, {"insert"}, 2}, &ContainerModeling::handleInsert}, + {{CDM::CXXMethod, {"emplace"}, 2}, &ContainerModeling::handleInsert}, + {{CDM::CXXMethod, {"erase"}, 1}, &ContainerModeling::handleErase}, + {{CDM::CXXMethod, {"erase_after"}, 1}, + &ContainerModeling::handleEraseAfter}, }; CallDescriptionMap TwoIterParamFunctions = { - {{{"erase"}, 2}, &ContainerModeling::handleErase}, - {{{"erase_after"}, 2}, &ContainerModeling::handleEraseAfter}, + {{CDM::CXXMethod, {"erase"}, 2}, &ContainerModeling::handleErase}, + {{CDM::CXXMethod, {"erase_after"}, 2}, + &ContainerModeling::handleEraseAfter}, }; }; diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp index 72186a99d9435..d3830a01dd0cb 100644 --- a/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DebugContainerModeling.cpp @@ -42,9 +42,9 @@ class DebugContainerModeling CheckerContext &) const; CallDescriptionMap Callbacks = { - {{{"clang_analyzer_container_begin"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_container_begin"}, 1}, &DebugContainerModeling::analyzerContainerBegin}, - {{{"clang_analyzer_container_end"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_container_end"}, 1}, &DebugContainerModeling::analyzerContainerEnd}, }; diff --git a/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp index 79ab71d7829db..203743dacda63 100644 --- 
a/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/DebugIteratorModeling.cpp @@ -43,11 +43,11 @@ class DebugIteratorModeling CheckerContext &) const; CallDescriptionMap Callbacks = { - {{{"clang_analyzer_iterator_position"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_iterator_position"}, 1}, &DebugIteratorModeling::analyzerIteratorPosition}, - {{{"clang_analyzer_iterator_container"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_iterator_container"}, 1}, &DebugIteratorModeling::analyzerIteratorContainer}, - {{{"clang_analyzer_iterator_validity"}, 1}, + {{CDM::SimpleFunc, {"clang_analyzer_iterator_validity"}, 1}, &DebugIteratorModeling::analyzerIteratorValidity}, }; diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp index a95e811c2a418..5649454b4cd47 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp @@ -129,19 +129,20 @@ class IteratorModeling CallDescriptionMap AdvanceLikeFunctions = { // template // void advance(InputIt& it, Distance n); - {{{"std", "advance"}, 2}, &IteratorModeling::handleAdvance}, + {{CDM::SimpleFunc, {"std", "advance"}, 2}, + &IteratorModeling::handleAdvance}, // template // BidirIt prev( // BidirIt it, // typename std::iterator_traits::difference_type n = 1); - {{{"std", "prev"}, 2}, &IteratorModeling::handlePrev}, + {{CDM::SimpleFunc, {"std", "prev"}, 2}, &IteratorModeling::handlePrev}, // template // ForwardIt next( // ForwardIt it, // typename std::iterator_traits::difference_type n = 1); - {{{"std", "next"}, 2}, &IteratorModeling::handleNext}, + {{CDM::SimpleFunc, {"std", "next"}, 2}, &IteratorModeling::handleNext}, }; public: diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp index d2b61fb92483c..4dd2f700a2a0e 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp @@ -56,10 +56,15 @@ class IteratorRangeChecker using AdvanceFn = void (IteratorRangeChecker::*)(CheckerContext &, SVal, SVal) const; + // FIXME: these three functions are also listed in IteratorModeling.cpp, + // perhaps unify their handling? 
CallDescriptionMap AdvanceFunctions = { - {{{"std", "advance"}, 2}, &IteratorRangeChecker::verifyAdvance}, - {{{"std", "prev"}, 2}, &IteratorRangeChecker::verifyPrev}, - {{{"std", "next"}, 2}, &IteratorRangeChecker::verifyNext}, + {{CDM::SimpleFunc, {"std", "advance"}, 2}, + &IteratorRangeChecker::verifyAdvance}, + {{CDM::SimpleFunc, {"std", "prev"}, 2}, + &IteratorRangeChecker::verifyPrev}, + {{CDM::SimpleFunc, {"std", "next"}, 2}, + &IteratorRangeChecker::verifyNext}, }; }; diff --git a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp index 88fb42b6625aa..11651fd491f74 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MallocChecker.cpp @@ -401,10 +401,11 @@ class MallocChecker }; const CallDescriptionMap FreeingMemFnMap{ - {{{"free"}, 1}, &MallocChecker::checkFree}, - {{{"if_freenameindex"}, 1}, &MallocChecker::checkIfFreeNameIndex}, - {{{"kfree"}, 1}, &MallocChecker::checkFree}, - {{{"g_free"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"free"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"if_freenameindex"}, 1}, + &MallocChecker::checkIfFreeNameIndex}, + {{CDM::CLibrary, {"kfree"}, 1}, &MallocChecker::checkFree}, + {{CDM::CLibrary, {"g_free"}, 1}, &MallocChecker::checkFree}, }; bool isFreeingCall(const CallEvent &Call) const; @@ -413,41 +414,46 @@ class MallocChecker friend class NoOwnershipChangeVisitor; CallDescriptionMap AllocatingMemFnMap{ - {{{"alloca"}, 1}, &MallocChecker::checkAlloca}, - {{{"_alloca"}, 1}, &MallocChecker::checkAlloca}, - {{{"malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"malloc"}, 3}, &MallocChecker::checkKernelMalloc}, - {{{"calloc"}, 2}, &MallocChecker::checkCalloc}, - {{{"valloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"alloca"}, 1}, &MallocChecker::checkAlloca}, + {{CDM::CLibrary, {"_alloca"}, 1}, &MallocChecker::checkAlloca}, + // The line for "alloca" also covers "__builtin_alloca", but the + // _with_align variant must be listed separately because it takes an + // extra argument: + {{CDM::CLibrary, {"__builtin_alloca_with_align"}, 2}, + &MallocChecker::checkAlloca}, + {{CDM::CLibrary, {"malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"malloc"}, 3}, &MallocChecker::checkKernelMalloc}, + {{CDM::CLibrary, {"calloc"}, 2}, &MallocChecker::checkCalloc}, + {{CDM::CLibrary, {"valloc"}, 1}, &MallocChecker::checkBasicAlloc}, {{CDM::CLibrary, {"strndup"}, 2}, &MallocChecker::checkStrdup}, {{CDM::CLibrary, {"strdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"_strdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"kmalloc"}, 2}, &MallocChecker::checkKernelMalloc}, - {{{"if_nameindex"}, 1}, &MallocChecker::checkIfNameIndex}, + {{CDM::CLibrary, {"_strdup"}, 1}, &MallocChecker::checkStrdup}, + {{CDM::CLibrary, {"kmalloc"}, 2}, &MallocChecker::checkKernelMalloc}, + {{CDM::CLibrary, {"if_nameindex"}, 1}, &MallocChecker::checkIfNameIndex}, {{CDM::CLibrary, {"wcsdup"}, 1}, &MallocChecker::checkStrdup}, {{CDM::CLibrary, {"_wcsdup"}, 1}, &MallocChecker::checkStrdup}, - {{{"g_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"g_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, - {{{"g_try_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, - {{{"g_try_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, - {{{"g_memdup"}, 2}, &MallocChecker::checkGMemdup}, - {{{"g_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, - {{{"g_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, - {{{"g_try_malloc_n"}, 2}, 
&MallocChecker::checkGMallocN}, - {{{"g_try_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, + {{CDM::CLibrary, {"g_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"g_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, + {{CDM::CLibrary, {"g_try_malloc"}, 1}, &MallocChecker::checkBasicAlloc}, + {{CDM::CLibrary, {"g_try_malloc0"}, 1}, &MallocChecker::checkGMalloc0}, + {{CDM::CLibrary, {"g_memdup"}, 2}, &MallocChecker::checkGMemdup}, + {{CDM::CLibrary, {"g_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, + {{CDM::CLibrary, {"g_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, + {{CDM::CLibrary, {"g_try_malloc_n"}, 2}, &MallocChecker::checkGMallocN}, + {{CDM::CLibrary, {"g_try_malloc0_n"}, 2}, &MallocChecker::checkGMallocN0}, }; CallDescriptionMap ReallocatingMemFnMap{ - {{{"realloc"}, 2}, + {{CDM::CLibrary, {"realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"reallocf"}, 2}, + {{CDM::CLibrary, {"reallocf"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, true)}, - {{{"g_realloc"}, 2}, + {{CDM::CLibrary, {"g_realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"g_try_realloc"}, 2}, + {{CDM::CLibrary, {"g_try_realloc"}, 2}, std::bind(&MallocChecker::checkRealloc, _1, _2, _3, false)}, - {{{"g_realloc_n"}, 3}, &MallocChecker::checkReallocN}, - {{{"g_try_realloc_n"}, 3}, &MallocChecker::checkReallocN}, + {{CDM::CLibrary, {"g_realloc_n"}, 3}, &MallocChecker::checkReallocN}, + {{CDM::CLibrary, {"g_try_realloc_n"}, 3}, &MallocChecker::checkReallocN}, // NOTE: the following CallDescription also matches the C++ standard // library function std::getline(); the callback will filter it out. @@ -1259,9 +1265,6 @@ static bool isStandardRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - if (isa(FD)) - return false; - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(1)->getType().getDesugaredType(AC) == @@ -1273,9 +1276,6 @@ static bool isGRealloc(const CallEvent &Call) { assert(FD); ASTContext &AC = FD->getASTContext(); - if (isa(FD)) - return false; - return FD->getDeclaredReturnType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(0)->getType().getDesugaredType(AC) == AC.VoidPtrTy && FD->getParamDecl(1)->getType().getDesugaredType(AC) == @@ -1284,14 +1284,14 @@ static bool isGRealloc(const CallEvent &Call) { void MallocChecker::checkRealloc(const CallEvent &Call, CheckerContext &C, bool ShouldFreeOnFail) const { - // HACK: CallDescription currently recognizes non-standard realloc functions - // as standard because it doesn't check the type, or wether its a non-method - // function. This should be solved by making CallDescription smarter. - // Mind that this came from a bug report, and all other functions suffer from - // this. - // https://bugs.llvm.org/show_bug.cgi?id=46253 + // Ignore calls to functions whose type does not match the expected type of + // either the standard realloc or g_realloc from GLib. + // FIXME: Should we perform this kind of checking consistently for each + // function? If yes, then perhaps extend the `CallDescription` interface to + // handle this. 
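The guard just below relies on two cooperating changes: the new CDM::CLibrary descriptions no longer match C++ methods, so isStandardRealloc and isGRealloc (above) drop their explicit method checks, and what remains only verifies that the callee's desugared signature has the expected shape, roughly void *(void *, size_t) for the standard realloc. A toy analogue of that shape check, assuming the signature has already been rendered to strings (the Signature type and helper below are hypothetical, not analyzer API):

#include <string>
#include <vector>

// Hypothetical, simplified analogue of isStandardRealloc(): accept only
// callees whose signature matches `void *(void *, size_t)`.
struct Signature {
  std::string Return;
  std::vector<std::string> Params;
};

bool looksLikeStandardRealloc(const Signature &S) {
  return S.Return == "void *" && S.Params.size() == 2 &&
         S.Params[0] == "void *" && S.Params[1] == "size_t";
}

int main() {
  Signature Good{"void *", {"void *", "size_t"}};
  Signature Bad{"int", {"void *", "size_t"}};
  return (looksLikeStandardRealloc(Good) && !looksLikeStandardRealloc(Bad))
             ? 0
             : 1;
}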
if (!isStandardRealloc(Call) && !isGRealloc(Call)) return; + ProgramStateRef State = C.getState(); State = ReallocMemAux(C, Call, ShouldFreeOnFail, State, AF_Malloc); State = ProcessZeroAllocCheck(Call, 1, State); @@ -1842,9 +1842,18 @@ static ProgramStateRef MallocUpdateRefState(CheckerContext &C, const Expr *E, return nullptr; SymbolRef Sym = RetVal->getAsLocSymbol(); + // This is a return value of a function that was not inlined, such as malloc() // or new(). We've checked that in the caller. Therefore, it must be a symbol. assert(Sym); + // FIXME: In theory this assertion should fail for `alloca()` calls (because + // `AllocaRegion`s are not symbolic); but in practice this does not happen. + // As the current code appears to work correctly, I'm not touching this issue + // now, but it would be good to investigate and clarify this. + // Also note that perhaps the special `AllocaRegion` should be replaced by + // `SymbolicRegion` (or turned into a subclass of `SymbolicRegion`) to enable + // proper tracking of memory allocated by `alloca()` -- and after that change + // this assertion would become valid again. // Set the symbol's state to Allocated. return State->set(Sym, RefState::getAllocated(Family, E)); diff --git a/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp index a5173a05636a0..e037719b90298 100644 --- a/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/STLAlgorithmModeling.cpp @@ -33,29 +33,50 @@ class STLAlgorithmModeling : public Checker { const CallExpr *) const; const CallDescriptionMap Callbacks = { - {{{"std", "find"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if_not"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_if_not"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_first_of"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_first_of"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_first_of"}, 6}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_end"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_end"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "find_end"}, 6}, &STLAlgorithmModeling::evalFind}, - {{{"std", "lower_bound"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "lower_bound"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "upper_bound"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "upper_bound"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 3}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search"}, 6}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search_n"}, 4}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search_n"}, 5}, &STLAlgorithmModeling::evalFind}, - {{{"std", "search_n"}, 6}, &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find"}, 3}, &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find"}, 4}, &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_if_not"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, 
{"std", "find_if_not"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_first_of"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_first_of"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_first_of"}, 6}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_end"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_end"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "find_end"}, 6}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "lower_bound"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "lower_bound"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "upper_bound"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "upper_bound"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 3}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search"}, 6}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search_n"}, 4}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search_n"}, 5}, + &STLAlgorithmModeling::evalFind}, + {{CDM::SimpleFunc, {"std", "search_n"}, 6}, + &STLAlgorithmModeling::evalFind}, }; public: diff --git a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp index e5dd907c660d8..fefe846b6911f 100644 --- a/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/cert/InvalidPtrChecker.cpp @@ -48,14 +48,19 @@ class InvalidPtrChecker bool InvalidatingGetEnv = false; // GetEnv can be treated invalidating and non-invalidating as well. 
- const CallDescription GetEnvCall{{"getenv"}, 1}; + const CallDescription GetEnvCall{CDM::CLibrary, {"getenv"}, 1}; const CallDescriptionMap<HandlerFn> EnvpInvalidatingFunctions = { - {{{"setenv"}, 3}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"unsetenv"}, 1}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"putenv"}, 1}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"_putenv_s"}, 2}, &InvalidPtrChecker::EnvpInvalidatingCall}, - {{{"_wputenv_s"}, 2}, &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"setenv"}, 3}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"unsetenv"}, 1}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"putenv"}, 1}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"_putenv_s"}, 2}, + &InvalidPtrChecker::EnvpInvalidatingCall}, + {{CDM::CLibrary, {"_wputenv_s"}, 2}, + &InvalidPtrChecker::EnvpInvalidatingCall}, }; void postPreviousReturnInvalidatingCall(const CallEvent &Call, @@ -63,13 +68,13 @@ class InvalidPtrChecker // SEI CERT ENV34-C const CallDescriptionMap<HandlerFn> PreviousCallInvalidatingFunctions = { - {{{"setlocale"}, 2}, + {{CDM::CLibrary, {"setlocale"}, 2}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, - {{{"strerror"}, 1}, + {{CDM::CLibrary, {"strerror"}, 1}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, - {{{"localeconv"}, 0}, + {{CDM::CLibrary, {"localeconv"}, 0}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, - {{{"asctime"}, 1}, + {{CDM::CLibrary, {"asctime"}, 1}, &InvalidPtrChecker::postPreviousReturnInvalidatingCall}, }; @@ -205,8 +210,12 @@ void InvalidPtrChecker::postPreviousReturnInvalidatingCall( CE, LCtx, CE->getType(), C.blockCount()); State = State->BindExpr(CE, LCtx, RetVal); + const auto *SymRegOfRetVal = + dyn_cast_or_null<SymbolicRegion>(RetVal.getAsRegion()); + if (!SymRegOfRetVal) + return; + // Remember to this region. - const auto *SymRegOfRetVal = cast<SymbolicRegion>(RetVal.getAsRegion()); const MemRegion *MR = SymRegOfRetVal->getBaseRegion(); State = State->set(FD, MR); diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 94ccbd3351b09..e19f19b2528c1 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -154,6 +154,26 @@ void ModuleDepCollector::addOutputPaths(CowCompilerInvocation &CI, } } +void dependencies::resetBenignCodeGenOptions(frontend::ActionKind ProgramAction, + const LangOptions &LangOpts, + CodeGenOptions &CGOpts) { + // TODO: Figure out better way to set options to their default value. + if (ProgramAction == frontend::GenerateModule) { + CGOpts.MainFileName.clear(); + CGOpts.DwarfDebugFlags.clear(); + } + if (ProgramAction == frontend::GeneratePCH || + (ProgramAction == frontend::GenerateModule && !LangOpts.ModulesCodegen)) { + CGOpts.DebugCompilationDir.clear(); + CGOpts.CoverageCompilationDir.clear(); + CGOpts.CoverageDataFile.clear(); + CGOpts.CoverageNotesFile.clear(); + CGOpts.ProfileInstrumentUsePath.clear(); + CGOpts.SampleProfileFile.clear(); + CGOpts.ProfileRemappingFile.clear(); + } +} + static CowCompilerInvocation makeCommonInvocationForModuleBuild(CompilerInvocation CI) { CI.resetNonModularOptions(); @@ -167,18 +187,8 @@ makeCommonInvocationForModuleBuild(CompilerInvocation CI) { // LLVM options are not going to affect the AST CI.getFrontendOpts().LLVMArgs.clear(); - // TODO: Figure out better way to set options to their default value. 
- CI.getCodeGenOpts().MainFileName.clear(); - CI.getCodeGenOpts().DwarfDebugFlags.clear(); - if (!CI.getLangOpts().ModulesCodegen) { - CI.getCodeGenOpts().DebugCompilationDir.clear(); - CI.getCodeGenOpts().CoverageCompilationDir.clear(); - CI.getCodeGenOpts().CoverageDataFile.clear(); - CI.getCodeGenOpts().CoverageNotesFile.clear(); - CI.getCodeGenOpts().ProfileInstrumentUsePath.clear(); - CI.getCodeGenOpts().SampleProfileFile.clear(); - CI.getCodeGenOpts().ProfileRemappingFile.clear(); - } + resetBenignCodeGenOptions(frontend::GenerateModule, CI.getLangOpts(), + CI.getCodeGenOpts()); // Map output paths that affect behaviour to "-" so their existence is in the // context hash. The final path will be computed in addOutputPaths. @@ -342,6 +352,8 @@ static bool needsModules(FrontendInputFile FIF) { void ModuleDepCollector::applyDiscoveredDependencies(CompilerInvocation &CI) { CI.clearImplicitModuleBuildOptions(); + resetBenignCodeGenOptions(CI.getFrontendOpts().ProgramAction, + CI.getLangOpts(), CI.getCodeGenOpts()); if (llvm::any_of(CI.getFrontendOpts().Inputs, needsModules)) { Preprocessor &PP = ScanInstance.getPreprocessor(); diff --git a/clang/test/AST/Interp/builtin-align-cxx.cpp b/clang/test/AST/Interp/builtin-align-cxx.cpp new file mode 100644 index 0000000000000..62d73dba929b2 --- /dev/null +++ b/clang/test/AST/Interp/builtin-align-cxx.cpp @@ -0,0 +1,258 @@ +// C++-specific checks for the alignment builtins +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=expected,both -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -triple=x86_64-unknown-unknown -std=c++11 %s -fsyntax-only -verify=ref,both + + +/// This is just a copy of the one from test/SemaCXX/ with some of the +/// diagnostic output adapted. +/// Also, align32array has an initializer now, which means it's not just +/// a dummy pointer for us and we do actually have type information for it. 
+/// In the future, we need to retain type information for dummy pointers as +/// well, so here is a test that will break once we do that: +namespace { + _Alignas(32) char heh[4]; + static_assert(!__builtin_is_aligned(&heh[1], 4), ""); // expected-error {{failed}} +} + + +// Check that we don't crash when using dependent types in __builtin_align: +template <typename a, a b> +void *c(void *d) { // both-note{{candidate template ignored}} + return __builtin_align_down(d, b); +} + +struct x {}; +x foo; +void test(void *value) { + c<int, 16>(value); + c<struct x, foo>(value); // both-error{{no matching function for call to 'c'}} +} + +template <typename T, long Alignment, long ArraySize = 16> +void test_templated_arguments() { + T array[ArraySize]; // both-error{{variable has incomplete type 'fwddecl'}} + static_assert(__is_same(decltype(__builtin_align_up(array, Alignment)), T *), // both-error{{requested alignment is not a power of 2}} + "return type should be the decayed array type"); + static_assert(__is_same(decltype(__builtin_align_down(array, Alignment)), T *), + "return type should be the decayed array type"); + static_assert(__is_same(decltype(__builtin_is_aligned(array, Alignment)), bool), + "return type should be bool"); + T *x1 = __builtin_align_up(array, Alignment); + T *x2 = __builtin_align_down(array, Alignment); + bool x3 = __builtin_align_up(array, Alignment); +} + +void test() { + test_templated_arguments<int, 32>(); // fine + test_templated_arguments<struct fwddecl, 16>(); + // both-note@-1{{in instantiation of function template specialization 'test_templated_arguments<fwddecl, 16>'}} + // both-note@-2{{forward declaration of 'fwddecl'}} + test_templated_arguments<int, 7>(); // invalid alignment value + // both-note@-1{{in instantiation of function template specialization 'test_templated_arguments<int, 7>'}} +} + +template <typename T, long ArraySize> +void test_incorrect_alignment_without_instatiation(T value) { + int array[32]; + static_assert(__is_same(decltype(__builtin_align_up(array, 31)), int *), // both-error{{requested alignment is not a power of 2}} + "return type should be the decayed array type"); + static_assert(__is_same(decltype(__builtin_align_down(array, 7)), int *), // both-error{{requested alignment is not a power of 2}} + "return type should be the decayed array type"); + static_assert(__is_same(decltype(__builtin_is_aligned(array, -1)), bool), // both-error{{requested alignment must be 1 or greater}} + "return type should be bool"); + __builtin_align_up(array); // both-error{{too few arguments to function call, expected 2, have 1}} + __builtin_align_up(array, 31); // both-error{{requested alignment is not a power of 2}} + __builtin_align_down(array, 31); // both-error{{requested alignment is not a power of 2}} + __builtin_align_up(array, 31); // both-error{{requested alignment is not a power of 2}} + __builtin_align_up(value, 31); // This shouldn't warn since the type is dependent + __builtin_align_up(value); // Same here + + __builtin_align_up(array, sizeof(sizeof(value)) - 1); // both-error{{requested alignment is not a power of 2}} + __builtin_align_up(array, value); // no diagnostic as the alignment is value dependent. + (void)__builtin_align_up(array, ArraySize); // The same as above here +} + +// The original fix for the issue above broke some legitimate code. 
+// Here is a regression test: +typedef __SIZE_TYPE__ size_t; +void *allocate_impl(size_t size); +template <typename T> +T *allocate() { + constexpr size_t allocation_size = + __builtin_align_up(sizeof(T), sizeof(void *)); + return static_cast<T *>( + __builtin_assume_aligned(allocate_impl(allocation_size), sizeof(void *))); +} +struct Foo { + int value; +}; +void *test2() { + return allocate<Foo>(); +} + +// Check that pointers-to-members cannot be used: +class MemPtr { +public: + int data; + void func(); + virtual void vfunc(); +}; +void test_member_ptr() { + __builtin_align_up(&MemPtr::data, 64); // both-error{{operand of type 'int MemPtr::*' where arithmetic or pointer type is required}} + __builtin_align_down(&MemPtr::func, 64); // both-error{{operand of type 'void (MemPtr::*)()' where arithmetic or pointer type is required}} + __builtin_is_aligned(&MemPtr::vfunc, 64); // both-error{{operand of type 'void (MemPtr::*)()' where arithmetic or pointer type is required}} +} + +void test_references(Foo &i) { + // Check that the builtins look at the referenced type rather than the reference itself. + (void)__builtin_align_up(i, 64); // both-error{{operand of type 'Foo' where arithmetic or pointer type is required}} + (void)__builtin_align_up(static_cast<Foo &>(i), 64); // both-error{{operand of type 'Foo' where arithmetic or pointer type is required}} + (void)__builtin_align_up(static_cast<const Foo &>(i), 64); // both-error{{operand of type 'const Foo' where arithmetic or pointer type is required}} + (void)__builtin_align_up(static_cast<Foo &&>(i), 64); // both-error{{operand of type 'Foo' where arithmetic or pointer type is required}} + (void)__builtin_align_up(static_cast<const Foo &&>(i), 64); // both-error{{operand of type 'const Foo' where arithmetic or pointer type is required}} + (void)__builtin_align_up(&i, 64); +} + +// Check that constexpr wrapper functions can be constant-evaluated. 
+template <typename T> +constexpr bool wrap_is_aligned(T ptr, long align) { + return __builtin_is_aligned(ptr, align); + // both-note@-1{{requested alignment -3 is not a positive power of two}} + // both-note@-2{{requested alignment 19 is not a positive power of two}} + // both-note@-3{{requested alignment must be 128 or less for type 'char'; 4194304 is invalid}} +} +template <typename T> +constexpr T wrap_align_up(T ptr, long align) { + return __builtin_align_up(ptr, align); + // both-note@-1{{requested alignment -2 is not a positive power of two}} + // both-note@-2{{requested alignment 18 is not a positive power of two}} + // both-note@-3{{requested alignment must be 2147483648 or less for type 'int'; 8589934592 is invalid}} + // both-error@-4{{operand of type 'bool' where arithmetic or pointer type is required}} +} + +template <typename T> +constexpr T wrap_align_down(T ptr, long align) { + return __builtin_align_down(ptr, align); + // both-note@-1{{requested alignment -1 is not a positive power of two}} + // both-note@-2{{requested alignment 17 is not a positive power of two}} + // both-note@-3{{requested alignment must be 32768 or less for type 'short'; 1048576 is invalid}} +} + +constexpr int a1 = wrap_align_up(22, 32); +static_assert(a1 == 32, ""); +constexpr int a2 = wrap_align_down(22, 16); +static_assert(a2 == 16, ""); +constexpr bool a3 = wrap_is_aligned(22, 32); +static_assert(!a3, ""); +static_assert(wrap_align_down(wrap_align_up(22, 16), 32) == 32, ""); +static_assert(wrap_is_aligned(wrap_align_down(wrap_align_up(22, 16), 32), 32), ""); +static_assert(!wrap_is_aligned(wrap_align_down(wrap_align_up(22, 16), 32), 64), ""); + +constexpr long const_value(long l) { return l; } +// Check some invalid values during constant-evaluation +static_assert(wrap_align_down(1, const_value(-1)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_align_up(1, const_value(-2)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_is_aligned(1, const_value(-3)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_align_down(1, const_value(17)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_align_up(1, const_value(18)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_is_aligned(1, const_value(19)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} + +// Check invalid values for smaller types: +static_assert(wrap_align_down(static_cast<short>(1), const_value(1 << 20)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to }} +// Check invalid boolean type +static_assert(wrap_align_up(static_cast<int>(1), const_value(1ull << 33)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} +static_assert(wrap_is_aligned(static_cast<char>(1), const_value(1 << 22)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in call to}} + +// Check invalid boolean type +static_assert(wrap_align_up(static_cast<bool>(1), const_value(1 << 21)), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{in instantiation of function template specialization 'wrap_align_up<bool>' requested here}} + +// Check constant evaluation for pointers: +_Alignas(32) char align32array[128] = {}; +static_assert(&align32array[0] == &align32array[0], ""); +// 
__builtin_align_up/down can be constant evaluated as a no-op for values +// that are known to have greater alignment: +static_assert(__builtin_align_up(&align32array[0], 32) == &align32array[0], ""); +static_assert(__builtin_align_up(&align32array[0], 4) == &align32array[0], ""); +static_assert(__builtin_align_down(&align32array[0], 4) == __builtin_align_up(&align32array[0], 8), ""); +// But it can not be evaluated if the alignment is greater than the minimum +// known alignment, since in that case the value might be the same if it happens +// to actually be aligned to 64 bytes at run time. +static_assert(&align32array[0] == __builtin_align_up(&align32array[0], 64), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}} +static_assert(__builtin_align_up(&align32array[0], 64) == __builtin_align_up(&align32array[0], 64), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}} + +// However, we can compute in case the requested alignment is less than the +// base alignment: +static_assert(__builtin_align_up(&align32array[0], 4) == &align32array[0], ""); +static_assert(__builtin_align_up(&align32array[1], 4) == &align32array[4], ""); +static_assert(__builtin_align_up(&align32array[2], 4) == &align32array[4], ""); +static_assert(__builtin_align_up(&align32array[3], 4) == &align32array[4], ""); +static_assert(__builtin_align_up(&align32array[4], 4) == &align32array[4], ""); +static_assert(__builtin_align_up(&align32array[5], 4) == &align32array[8], ""); +static_assert(__builtin_align_up(&align32array[6], 4) == &align32array[8], ""); +static_assert(__builtin_align_up(&align32array[7], 4) == &align32array[8], ""); +static_assert(__builtin_align_up(&align32array[8], 4) == &align32array[8], ""); + +static_assert(__builtin_align_down(&align32array[0], 4) == &align32array[0], ""); +static_assert(__builtin_align_down(&align32array[1], 4) == &align32array[0], ""); +static_assert(__builtin_align_down(&align32array[2], 4) == &align32array[0], ""); +static_assert(__builtin_align_down(&align32array[3], 4) == &align32array[0], ""); +static_assert(__builtin_align_down(&align32array[4], 4) == &align32array[4], ""); +static_assert(__builtin_align_down(&align32array[5], 4) == &align32array[4], ""); +static_assert(__builtin_align_down(&align32array[6], 4) == &align32array[4], ""); +static_assert(__builtin_align_down(&align32array[7], 4) == &align32array[4], ""); +static_assert(__builtin_align_down(&align32array[8], 4) == &align32array[8], ""); + +// Achieving the same thing using casts to uintptr_t is not allowed: +static_assert((char *)((__UINTPTR_TYPE__)&align32array[7] & ~3) == &align32array[4], ""); // both-error{{not an integral constant expression}} \ + // expected-note {{cast that performs the conversions of a reinterpret_cast is not allowed in a constant expression}} + +static_assert(__builtin_align_down(&align32array[1], 4) == &align32array[0], ""); +static_assert(__builtin_align_down(&align32array[1], 64) == &align32array[0], ""); // both-error{{not an integral constant expression}} +// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}} + +// Add some checks for __builtin_is_aligned: +static_assert(__builtin_is_aligned(&align32array[0], 32), ""); +static_assert(__builtin_is_aligned(&align32array[4], 4), ""); +// We cannot constant evaluate whether the array is aligned to > 32 since this +// may well be 
true at run time. +static_assert(!__builtin_is_aligned(&align32array[0], 64), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{cannot constant evaluate whether run-time alignment is at least 64}} + +// However, if the alignment being checked is less than the minimum alignment of +// the base object we can check the low bits of the alignment: +static_assert(__builtin_is_aligned(&align32array[0], 4), ""); +static_assert(!__builtin_is_aligned(&align32array[1], 4), ""); +static_assert(!__builtin_is_aligned(&align32array[2], 4), ""); +static_assert(!__builtin_is_aligned(&align32array[3], 4), ""); +static_assert(__builtin_is_aligned(&align32array[4], 4), ""); + +// TODO: this should evaluate to true even though we can't evaluate the result +// of __builtin_align_up() to a concrete value +static_assert(__builtin_is_aligned(__builtin_align_up(&align32array[0], 64), 64), ""); // both-error{{not an integral constant expression}} +// both-note@-1{{cannot constant evaluate the result of adjusting alignment to 64}} + +// Check different source and alignment type widths are handled correctly. +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +// Also check signed -- unsigned mismatch. +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); +static_assert(!__builtin_is_aligned(static_cast(7), static_cast(4)), ""); + +// Check the diagnostic message +_Alignas(void) char align_void_array[1]; // both-error {{invalid application of '_Alignas' to an incomplete type 'void'}} diff --git a/clang/test/AST/Interp/builtin-functions.cpp b/clang/test/AST/Interp/builtin-functions.cpp index a7adc92d3714f..1a29a664d7ce5 100644 --- a/clang/test/AST/Interp/builtin-functions.cpp +++ b/clang/test/AST/Interp/builtin-functions.cpp @@ -24,16 +24,13 @@ namespace strcmp { static_assert(__builtin_strcmp("abab", "abab\0banana") == 0, ""); static_assert(__builtin_strcmp("abab\0banana", "abab\0canada") == 0, ""); static_assert(__builtin_strcmp(0, "abab") == 0, ""); // both-error {{not an integral constant}} \ - // both-note {{dereferenced null}} \ - // expected-note {{in call to}} + // both-note {{dereferenced null}} static_assert(__builtin_strcmp("abab", 0) == 0, ""); // both-error {{not an integral constant}} \ - // both-note {{dereferenced null}} \ - // expected-note {{in call to}} + // both-note {{dereferenced null}} static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar) == -1, ""); static_assert(__builtin_strcmp(kFoobar, kFoobazfoobar + 6) == 0, ""); // both-error {{not an integral constant}} \ - // both-note {{dereferenced one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{dereferenced one-past-the-end}} /// Used to assert because we're passing a dummy pointer to /// __builtin_strcmp() when evaluating the return statement. 
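For reference, here is a minimal standalone sketch (an editor's illustration, not part of the patch) of the mask arithmetic that the alignment builtins exercised in builtin-align-cxx.cpp above fold to on integer operands; it assumes `align` is a power of two, exactly as the diagnostics require:

// Plain-integer equivalents of __builtin_align_down/up and __builtin_is_aligned.
constexpr unsigned long align_down(unsigned long x, unsigned long align) { return x & ~(align - 1); }
constexpr unsigned long align_up(unsigned long x, unsigned long align) { return (x + align - 1) & ~(align - 1); }
constexpr bool is_aligned(unsigned long x, unsigned long align) { return (x & (align - 1)) == 0; }
static_assert(align_up(22, 32) == 32, "");   // mirrors wrap_align_up(22, 32)
static_assert(align_down(22, 16) == 16, ""); // mirrors wrap_align_down(22, 16)
static_assert(!is_aligned(22, 32), "");      // mirrors wrap_is_aligned(22, 32)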
@@ -72,14 +69,11 @@ constexpr const char *a = "foo\0quux"; static_assert(check(c), ""); constexpr int over1 = __builtin_strlen(a + 9); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} constexpr int over2 = __builtin_strlen(b + 9); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} constexpr int over3 = __builtin_strlen(c + 9); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} constexpr int under1 = __builtin_strlen(a - 1); // both-error {{constant expression}} \ // both-note {{cannot refer to element -1}} @@ -90,8 +84,7 @@ constexpr const char *a = "foo\0quux"; constexpr char d[] = { 'f', 'o', 'o' }; // no nul terminator. constexpr int bad = __builtin_strlen(d); // both-error {{constant expression}} \ - // both-note {{one-past-the-end}} \ - // expected-note {{in call to}} + // both-note {{one-past-the-end}} } namespace nan { @@ -114,8 +107,7 @@ namespace nan { /// FIXME: Current interpreter misses diagnostics. constexpr char f2[] = {'0', 'x', 'A', 'E'}; /// No trailing 0 byte. constexpr double NaN7 = __builtin_nan(f2); // both-error {{must be initialized by a constant expression}} \ - // expected-note {{read of dereferenced one-past-the-end pointer}} \ - // expected-note {{in call to}} + // expected-note {{read of dereferenced one-past-the-end pointer}} static_assert(!__builtin_issignaling(__builtin_nan("")), ""); static_assert(__builtin_issignaling(__builtin_nans("")), ""); } diff --git a/clang/test/AST/Interp/cxx03.cpp b/clang/test/AST/Interp/cxx03.cpp index d30cbb2fd7a20..b6aaf0840cfb3 100644 --- a/clang/test/AST/Interp/cxx03.cpp +++ b/clang/test/AST/Interp/cxx03.cpp @@ -10,3 +10,17 @@ namespace NonInitializingMemberExpr { // both-note {{required by}} \ // both-note {{subexpression not valid}} } + + +namespace NonLValueMemberExpr { + struct PODType { + int value; + }; + +#define ATTR __attribute__((require_constant_initialization)) + struct TT1 { + ATTR static const int &subobj_init; + }; + + const int &TT1::subobj_init = PODType().value; +} diff --git a/clang/test/AST/Interp/functions.cpp b/clang/test/AST/Interp/functions.cpp index 4fb3c816000ab..f9bb5d53634e0 100644 --- a/clang/test/AST/Interp/functions.cpp +++ b/clang/test/AST/Interp/functions.cpp @@ -584,9 +584,20 @@ namespace VariadicOperator { namespace WeakCompare { [[gnu::weak]]void weak_method(); static_assert(weak_method != nullptr, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} + // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} constexpr auto A = &weak_method; static_assert(A != nullptr, ""); // both-error {{not an integral constant expression}} \ - // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} + // both-note {{comparison against address of weak declaration '&weak_method' can only be performed at runtim}} +} + +namespace FromIntegral { +#if __cplusplus >= 202002L + typedef double (*DoubleFn)(); + int a[(int)DoubleFn((void*)-1)()]; // both-error {{not allowed at file scope}} \ + // both-warning {{variable length arrays}} + int b[(int)DoubleFn((void*)(-1 + 1))()]; // both-error {{not 
allowed at file scope}} \ + // expected-note {{evaluates to a null function pointer}} \ + // both-warning {{variable length arrays}} +#endif } diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index f251497ed7018..2c33fa1bf8843 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -1309,3 +1309,11 @@ namespace pr18633 { func2(); } } + +namespace { + struct F { + static constexpr int Z = 12; + }; + F f; + static_assert(f.Z == 12, ""); +} diff --git a/clang/test/AST/Interp/vectors.cpp b/clang/test/AST/Interp/vectors.cpp index 8afef3c897bff..5c4694f122d81 100644 --- a/clang/test/AST/Interp/vectors.cpp +++ b/clang/test/AST/Interp/vectors.cpp @@ -1,10 +1,23 @@ // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify=expected,both %s // RUN: %clang_cc1 -verify=ref,both %s -// both-no-diagnostics - typedef int __attribute__((vector_size(16))) VI4; constexpr VI4 A = {1,2,3,4}; +static_assert(A[0] == 1, ""); // ref-error {{not an integral constant expression}} +static_assert(A[1] == 2, ""); // ref-error {{not an integral constant expression}} +static_assert(A[2] == 3, ""); // ref-error {{not an integral constant expression}} +static_assert(A[3] == 4, ""); // ref-error {{not an integral constant expression}} + +/// VectorSplat casts +typedef __attribute__(( ext_vector_type(4) )) float float4; +constexpr float4 vec4_0 = (float4)0.5f; +static_assert(vec4_0[0] == 0.5, ""); // ref-error {{not an integral constant expression}} +static_assert(vec4_0[1] == 0.5, ""); // ref-error {{not an integral constant expression}} +static_assert(vec4_0[2] == 0.5, ""); // ref-error {{not an integral constant expression}} +static_assert(vec4_0[3] == 0.5, ""); // ref-error {{not an integral constant expression}} +constexpr int vec4_0_discarded = ((float4)12.0f, 0); + + /// From constant-expression-cxx11.cpp namespace Vector { @@ -13,10 +26,18 @@ namespace Vector { return VI4 { n * 3, n + 4, n - 5, n / 6 }; } constexpr auto v1 = f(10); + static_assert(__builtin_vectorelements(v1) == (16 / sizeof(int)), ""); typedef double __attribute__((vector_size(32))) VD4; constexpr VD4 g(int n) { return (VD4) { n / 2.0, n + 1.5, n - 5.4, n * 0.9 }; } constexpr auto v2 = g(4); + static_assert(__builtin_vectorelements(v2) == (32 / sizeof(double)), ""); +} + +/// FIXME: We need to support BitCasts between vector types. 
+namespace { + typedef float __attribute__((vector_size(16))) VI42; + constexpr VI42 A2 = A; // expected-error {{must be initialized by a constant expression}} } diff --git a/clang/test/AST/ast-dump-attr-json.cpp b/clang/test/AST/ast-dump-attr-json.cpp index 051c2956abfdf..883e584bfedf0 100644 --- a/clang/test/AST/ast-dump-attr-json.cpp +++ b/clang/test/AST/ast-dump-attr-json.cpp @@ -46,6 +46,7 @@ __thread __attribute__ ((tls_model ("local-exec"))) int tls_model_var; // CHECK-NEXT: "tokLen": 11 // CHECK-NEXT: } // CHECK-NEXT: }, +// CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "global_decl", // CHECK-NEXT: "mangledName": "global_decl", // CHECK-NEXT: "type": { diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h index 85db68d41a6c8..1c2be322f83c2 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h @@ -1106,6 +1106,7 @@ using ostream = basic_ostream<char>; extern std::ostream cout; ostream &operator<<(ostream &, const string &); + #if __cplusplus >= 202002L template <class T> ostream &operator<<(ostream &, const std::unique_ptr<T> &); @@ -1122,11 +1123,12 @@ istream &getline(istream &, string &, char); istream &getline(istream &, string &); } // namespace std -#ifdef TEST_INLINABLE_ALLOCATORS namespace std { void *malloc(size_t); void free(void *); -} +} // namespace std + +#ifdef TEST_INLINABLE_ALLOCATORS void* operator new(std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); } void* operator new[](std::size_t size, const std::nothrow_t&) throw() { return std::malloc(size); } void operator delete(void* ptr, const std::nothrow_t&) throw() { std::free(ptr); } diff --git a/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp b/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp index fc067dd04428a..f46a2c9bc368f 100644 --- a/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp +++ b/clang/test/Analysis/cxx-uninitialized-object-ptr-ref.cpp @@ -1,9 +1,9 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.cplusplus.UninitializedObject \ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,optin.cplusplus.UninitializedObject \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:Pedantic=true -DPEDANTIC \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:CheckPointeeInitialization=true \ // RUN: -std=c++11 -verify %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core,optin.cplusplus.UninitializedObject \ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc,optin.cplusplus.UninitializedObject \ // RUN: -analyzer-config optin.cplusplus.UninitializedObject:CheckPointeeInitialization=true \ // RUN: -std=c++11 -verify %s @@ -316,7 +316,10 @@ void fCyclicPointerTest2() { // Void pointer tests are mainly no-crash tests. -void *malloc(int size); +typedef __typeof(sizeof(int)) size_t; + +void *calloc(size_t nmemb, size_t size); +void free(void *p); class VoidPointerTest1 { void *vptr; @@ -328,8 +331,9 @@ class VoidPointerTest1 { }; void fVoidPointerTest1() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerTest1(vptr, char()); + free(vptr); } class VoidPointerTest2 { @@ -342,8 +346,9 @@ class VoidPointerTest2 { }; void fVoidPointerTest2() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerTest2(&vptr, char()); + free(vptr); } class VoidPointerRRefTest1 { @@ -359,8 +364,9 @@ upon returning to the caller. 
This will be a dangling reference}} }; void fVoidPointerRRefTest1() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerRRefTest1(vptr, char()); + free(vptr); } class VoidPointerRRefTest2 { @@ -376,8 +382,9 @@ upon returning to the caller. This will be a dangling reference}} }; void fVoidPointerRRefTest2() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerRRefTest2(&vptr, char()); + free(vptr); } class VoidPointerLRefTest { @@ -393,8 +400,9 @@ upon returning to the caller. This will be a dangling reference}} }; void fVoidPointerLRefTest() { - void *vptr = malloc(sizeof(int)); + void *vptr = calloc(1, sizeof(int)); VoidPointerLRefTest(vptr, char()); + free(vptr); } struct CyclicVoidPointerTest { diff --git a/clang/test/Analysis/exercise-ps.c b/clang/test/Analysis/exercise-ps.c index d214c3959b207..d1e1771afddb5 100644 --- a/clang/test/Analysis/exercise-ps.c +++ b/clang/test/Analysis/exercise-ps.c @@ -1,5 +1,5 @@ // RUN: %clang_analyze_cc1 %s -verify -Wno-error=implicit-function-declaration \ -// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=core,unix.Malloc \ // RUN: -analyzer-config core.CallAndMessage:ArgPointeeInitializedness=true // // Just exercise the analyzer on code that has at one point caused issues diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index 30368b6976cc2..33fce10c4e2b2 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -1,7 +1,7 @@ // RUN: %clang_analyze_cc1 -triple i386-apple-darwin10 -verify %s \ -// RUN: -analyzer-checker=core.builtin \ // RUN: -analyzer-checker=debug.ExprInspection \ // RUN: -analyzer-checker=unix.cstring \ +// RUN: -analyzer-checker=unix.Malloc \ // RUN: -analyzer-config display-checker-name=false typedef unsigned long size_t; diff --git a/clang/test/Analysis/invalid-ptr-checker.cpp b/clang/test/Analysis/invalid-ptr-checker.cpp new file mode 100644 index 0000000000000..58bb45e0fb842 --- /dev/null +++ b/clang/test/Analysis/invalid-ptr-checker.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,security.cert.env.InvalidPtr -verify %s + +// expected-no-diagnostics + +namespace other { +int strerror(int errnum); // custom strerror +void no_crash_on_custom_strerror() { + (void)strerror(0); // no-crash +} +} // namespace other diff --git a/clang/test/Analysis/malloc-std-namespace.cpp b/clang/test/Analysis/malloc-std-namespace.cpp new file mode 100644 index 0000000000000..d4e397bb812aa --- /dev/null +++ b/clang/test/Analysis/malloc-std-namespace.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -analyzer-output=text %s + +// This file tests that unix.Malloc can handle C++ code where e.g. malloc and +// free are declared within the namespace 'std' by the header <cstdlib>. 
+ +#include "Inputs/system-header-simulator-cxx.h" + +void leak() { + int *p = static_cast(std::malloc(sizeof(int))); // expected-note{{Memory is allocated}} +} // expected-warning{{Potential leak of memory pointed to by 'p'}} + // expected-note@-1{{Potential leak of memory pointed to by 'p'}} + +void no_leak() { + int *p = static_cast(std::malloc(sizeof(int))); + std::free(p); // no-warning +} + +void invalid_free() { + int i; + int *p = &i; + //expected-note@+2{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}} + //expected-warning@+1{{Argument to free() is the address of the local variable 'i', which is not memory allocated by malloc()}} + std::free(p); +} diff --git a/clang/test/Analysis/malloc.c b/clang/test/Analysis/malloc.c index 09cd4b0bfce63..e5cb45ba73352 100644 --- a/clang/test/Analysis/malloc.c +++ b/clang/test/Analysis/malloc.c @@ -740,6 +740,17 @@ void allocaFree(void) { free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} } +void allocaFreeBuiltin(void) { + int *p = __builtin_alloca(sizeof(int)); + free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} +} + +void allocaFreeBuiltinAlign(void) { + int *p = __builtin_alloca_with_align(sizeof(int), 64); + free(p); // expected-warning {{Memory allocated by alloca() should not be deallocated}} +} + + int* mallocEscapeRet(void) { int *p = malloc(12); return p; // no warning diff --git a/clang/test/Analysis/malloc.cpp b/clang/test/Analysis/malloc.cpp index 14b4c0576384f..300b344ab25d6 100644 --- a/clang/test/Analysis/malloc.cpp +++ b/clang/test/Analysis/malloc.cpp @@ -214,3 +214,14 @@ void *realloc(void **ptr, size_t size) { realloc(ptr, size); } // no-crash namespace pr46253_paramty2{ void *realloc(void *ptr, int size) { realloc(ptr, size); } // no-crash } // namespace pr46253_paramty2 + +namespace pr81597 { +struct S {}; +struct T { + void free(const S& s); +}; +void f(T& t) { + S s; + t.free(s); // no-warning: This is not the free you are looking for... 
+} +} // namespace pr81597 diff --git a/clang/test/Analysis/stack-addr-ps.c b/clang/test/Analysis/stack-addr-ps.c index e469396e1bb22..e69ab4189b524 100644 --- a/clang/test/Analysis/stack-addr-ps.c +++ b/clang/test/Analysis/stack-addr-ps.c @@ -1,4 +1,4 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core -fblocks -verify %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -fblocks -verify %s int* f1(void) { int x = 0; diff --git a/clang/test/Analysis/stackaddrleak.c b/clang/test/Analysis/stackaddrleak.c index 0583bfc18711c..39c29f2a2635b 100644 --- a/clang/test/Analysis/stackaddrleak.c +++ b/clang/test/Analysis/stackaddrleak.c @@ -1,5 +1,5 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s -// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify -x c++ -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -std=c99 -Dbool=_Bool -Wno-bool-conversion %s +// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix.Malloc -verify -x c++ -Wno-bool-conversion %s typedef __INTPTR_TYPE__ intptr_t; char const *p; diff --git a/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp new file mode 100644 index 0000000000000..9c1f30f81a011 --- /dev/null +++ b/clang/test/CXX/dcl.decl/dcl.meaning/dcl.fct/p6-cxx23.cpp @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s + +auto x0 = requires (this int) { true; }; // expected-error {{a requires expression cannot have an explicit object parameter}} +auto x1 = requires (int, this int) { true; }; // expected-error {{a requires expression cannot have an explicit object parameter}} + +template // expected-error {{expected template parameter}} +void f(); // expected-error {{no function template matches function template specialization 'f'}} diff --git a/clang/test/CXX/drs/dr0xx.cpp b/clang/test/CXX/drs/dr0xx.cpp index a304862885c64..6c600bbc7c3f6 100644 --- a/clang/test/CXX/drs/dr0xx.cpp +++ b/clang/test/CXX/drs/dr0xx.cpp @@ -5,6 +5,11 @@ // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors -triple %itanium_abi_triple +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + namespace cwg1 { // cwg1: no namespace X { extern "C" void cwg1_f(int a = 1); } namespace Y { extern "C" void cwg1_f(int a = 1); } @@ -897,7 +902,7 @@ namespace cwg54 { // cwg54: 2.8 namespace cwg55 { // cwg55: yes enum E { e = 5 }; - int test[(e + 1 == 6) ? 
1 : -1]; + static_assert(e + 1 == 6, ""); } namespace cwg56 { // cwg56: yes @@ -1163,10 +1168,9 @@ namespace cwg75 { // cwg75: yes namespace cwg76 { // cwg76: yes const volatile int n = 1; - int arr[n]; // #cwg76-vla - // expected-error@#cwg76-vla {{variable length arrays in C++ are a Clang extension}} - // expected-note@#cwg76-vla {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} - // expected-error@#cwg76-vla {{variable length array declaration not allowed at file scope}} + static_assert(n, ""); + // expected-error@-1 {{static assertion expression is not an integral constant expression}} + // expected-note@-2 {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} } namespace cwg77 { // cwg77: yes diff --git a/clang/test/CXX/drs/dr16xx.cpp b/clang/test/CXX/drs/dr16xx.cpp index 6d7bb7619f8b8..cf6b45ceabf2c 100644 --- a/clang/test/CXX/drs/dr16xx.cpp +++ b/clang/test/CXX/drs/dr16xx.cpp @@ -153,10 +153,9 @@ namespace cwg1645 { // cwg1645: 3.9 namespace cwg1652 { // cwg1652: 3.6 int a, b; - int arr[&a + 1 == &b ? 1 : 2]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(&a + 1 == &b, ""); + // expected-error@-1 {{static assertion expression is not an integral constant expression}} // expected-note@-2 {{comparison against pointer '&a + 1' that points past the end of a complete object has unspecified value}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} } namespace cwg1653 { // cwg1653: 4 c++17 diff --git a/clang/test/CXX/drs/dr1xx.cpp b/clang/test/CXX/drs/dr1xx.cpp index 5b497dda047d6..a8f9b705a9866 100644 --- a/clang/test/CXX/drs/dr1xx.cpp +++ b/clang/test/CXX/drs/dr1xx.cpp @@ -5,6 +5,17 @@ // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx11,since-cxx17 -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#else +#define __enable_constant_folding +#endif + namespace cwg100 { // cwg100: yes template struct A {}; // #cwg100-A template struct B {}; // #cwg100-B @@ -736,8 +747,8 @@ namespace cwg147 { // cwg147: yes namespace cwg148 { // cwg148: yes struct A { int A::*p; }; - int check1[__is_pod(int(A::*)) ? 1 : -1]; - int check2[__is_pod(A) ? 1 : -1]; + static_assert(__is_pod(int(A::*)), ""); + static_assert(__is_pod(A), ""); } // cwg149: na @@ -745,13 +756,7 @@ namespace cwg148 { // cwg148: yes namespace cwg151 { // cwg151: 3.1 struct X {}; typedef int X::*p; -#if __cplusplus < 201103L -#define fold(x) (__builtin_constant_p(0) ? (x) : (x)) -#else -#define fold -#endif - int check[fold(p() == 0) ? 1 : -1]; -#undef fold + static_assert(__enable_constant_folding(p() == 0), ""); } namespace cwg152 { // cwg152: yes @@ -956,42 +961,42 @@ namespace cwg171 { namespace cwg172 { // cwg172: yes enum { zero }; - int check1[-1 < zero ? 1 : -1]; + static_assert(-1 < zero, ""); enum { x = -1, y = (unsigned int)-1 }; - int check2[sizeof(x) > sizeof(int) ? 1 : -1]; + static_assert(sizeof(x) > sizeof(int), ""); enum { a = (unsigned int)-1 / 2 }; - int check3a[sizeof(a) == sizeof(int) ? 
1 : -1]; - int check3b[-a < 0 ? 1 : -1]; + static_assert(sizeof(a) == sizeof(int), ""); + static_assert(-a < 0, ""); enum { b = (unsigned int)-1 / 2 + 1 }; - int check4a[sizeof(b) == sizeof(unsigned int) ? 1 : -1]; - int check4b[-b > 0 ? 1 : -1]; + static_assert(sizeof(b) == sizeof(unsigned int), ""); + static_assert(-b > 0, ""); enum { c = (unsigned long)-1 / 2 }; - int check5a[sizeof(c) == sizeof(long) ? 1 : -1]; - int check5b[-c < 0 ? 1 : -1]; + static_assert(sizeof(c) == sizeof(long), ""); + static_assert(-c < 0, ""); enum { d = (unsigned long)-1 / 2 + 1 }; - int check6a[sizeof(d) == sizeof(unsigned long) ? 1 : -1]; - int check6b[-d > 0 ? 1 : -1]; + static_assert(sizeof(d) == sizeof(unsigned long), ""); + static_assert(-d > 0, ""); enum { e = (unsigned long long)-1 / 2 }; // cxx98-error@-1 {{'long long' is a C++11 extension}} - int check7a[sizeof(e) == sizeof(long) ? 1 : -1]; - int check7b[-e < 0 ? 1 : -1]; + static_assert(sizeof(e) == sizeof(long), ""); + static_assert(-e < 0, ""); enum { f = (unsigned long long)-1 / 2 + 1 }; // cxx98-error@-1 {{'long long' is a C++11 extension}} - int check8a[sizeof(f) == sizeof(unsigned long) ? 1 : -1]; - int check8b[-f > 0 ? 1 : -1]; + static_assert(sizeof(f) == sizeof(unsigned long), ""); + static_assert(-f > 0, ""); } namespace cwg173 { // cwg173: yes - int check[('0' + 1 == '1' && '0' + 2 == '2' && '0' + 3 == '3' && - '0' + 4 == '4' && '0' + 5 == '5' && '0' + 6 == '6' && - '0' + 7 == '7' && '0' + 8 == '8' && '0' + 9 == '9') ? 1 : -1]; + static_assert('0' + 1 == '1' && '0' + 2 == '2' && '0' + 3 == '3' && + '0' + 4 == '4' && '0' + 5 == '5' && '0' + 6 == '6' && + '0' + 7 == '7' && '0' + 8 == '8' && '0' + 9 == '9', ""); } // cwg174: sup 1012 @@ -1070,7 +1075,7 @@ namespace cwg177 { // cwg177: yes } namespace cwg178 { // cwg178: yes - int check[int() == 0 ? 1 : -1]; + static_assert(int() == 0, ""); #if __cplusplus >= 201103L static_assert(int{} == 0, ""); struct S { int a, b; }; @@ -1180,7 +1185,7 @@ namespace cwg187 { // cwg187: sup 481 namespace cwg188 { // cwg188: yes char c[10]; - int check[sizeof(0, c) == 10 ? 1 : -1]; + static_assert(sizeof(0, c) == 10, ""); } // cwg190 FIXME: add codegen test for tbaa diff --git a/clang/test/CXX/drs/dr2xx.cpp b/clang/test/CXX/drs/dr2xx.cpp index e655e7226d51d..5d3e8ce4bea3b 100644 --- a/clang/test/CXX/drs/dr2xx.cpp +++ b/clang/test/CXX/drs/dr2xx.cpp @@ -10,10 +10,15 @@ typedef __SIZE_TYPE__ size_t; // cxx98-error@-1 0-1 {{'long long' is a C++11 extension}} -#if __cplusplus < 201103L -#define fold(x) (__builtin_constant_p(x) ? (x) : (x)) +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) #else -#define fold +#define __enable_constant_folding #endif namespace cwg200 { // cwg200: dup 214 @@ -31,7 +36,7 @@ namespace cwg200 { // cwg200: dup 214 namespace cwg202 { // cwg202: 3.1 template T f(); template struct X { - int arr[fold(g == &f) ? 1 : -1]; + static_assert(__enable_constant_folding(g == &f), ""); }; template struct X; } @@ -1024,7 +1029,7 @@ namespace cwg275 { // cwg275: no namespace cwg277 { // cwg277: 3.1 typedef int *intp; int *p = intp(); - int a[fold(intp() ? 
-1 : 1)]; + static_assert(__enable_constant_folding(!intp()), ""); } namespace cwg280 { // cwg280: 2.9 diff --git a/clang/test/CXX/drs/dr3xx.cpp b/clang/test/CXX/drs/dr3xx.cpp index 6d1c6958ac8eb..3e9228fe21fb6 100644 --- a/clang/test/CXX/drs/dr3xx.cpp +++ b/clang/test/CXX/drs/dr3xx.cpp @@ -5,6 +5,17 @@ // RUN: %clang_cc1 -std=c++11 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx11-14,since-cxx11 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++98 -verify=expected,cxx98-14,cxx98-17,cxx98-20,cxx98 -triple %itanium_abi_triple %s -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + +#if __cplusplus == 199711L +#define __enable_constant_folding(x) (__builtin_constant_p(x) ? (x) : (x)) +#else +#define __enable_constant_folding +#endif + namespace cwg300 { // cwg300: yes template void f(R (&)(A)) {} int g(int); @@ -396,7 +407,7 @@ namespace cwg324 { // cwg324: 3.6 namespace cwg326 { // cwg326: 3.1 struct S {}; - int test[__is_trivially_constructible(S, const S&) ? 1 : -1]; + static_assert(__is_trivially_constructible(S, const S&), ""); } namespace cwg327 { // cwg327: dup 538 @@ -653,7 +664,7 @@ namespace cwg339 { // cwg339: 2.8 template A make_A(); - int a[conv_int::value ? 1 : -1]; + static_assert(conv_int::value, ""); bool b = conv_int2(A<1>()); A<1> c = make_A(); } @@ -1099,21 +1110,14 @@ namespace cwg364 { // cwg364: yes #endif namespace cwg367 { // cwg367: yes - // FIXME: These diagnostics are terrible. Don't diagnose an ill-formed global - // array as being a VLA! - int a[true ? throw 0 : 4]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} - // expected-error@-2 {{variable length array declaration not allowed at file scope}} - int b[true ? 4 : throw 0]; - // cxx98-error@-1 {{variable length arrays in C++ are a Clang extension}} - // cxx98-error@-2 {{variable length array folded to constant array as an extension}} - int c[true ? *new int : 4]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(__enable_constant_folding(true ? throw 0 : 4), ""); + // expected-error@-1 {{expression is not an integral constant expression}} + static_assert(__enable_constant_folding(true ? 4 : throw 0), ""); + static_assert(__enable_constant_folding(true ? *new int : 4), ""); + // expected-error@-1 {{expression is not an integral constant expression}} // expected-note@-2 {{read of uninitialized object is not allowed in a constant expression}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} - int d[true ? 4 : *new int]; - // cxx98-error@-1 {{variable length arrays in C++ are a Clang extension}} - // cxx98-error@-2 {{variable length array folded to constant array as an extension}} + static_assert(__enable_constant_folding(true ? 4 : *new int), ""); + } namespace cwg368 { // cwg368: 3.6 @@ -1325,7 +1329,7 @@ namespace cwg383 { // cwg383: yes struct B { ~B(); }; union C { C &operator=(const C&); }; union D { ~D(); }; - int check[(__is_pod(A) || __is_pod(B) || __is_pod(C) || __is_pod(D)) ? 
-1 : 1]; + static_assert(!__is_pod(A) && !__is_pod(B) && !__is_pod(C) && !__is_pod(D), ""); } namespace cwg384 { // cwg384: yes diff --git a/clang/test/CXX/drs/dr4xx.cpp b/clang/test/CXX/drs/dr4xx.cpp index 611b791470785..07162cc28f6b6 100644 --- a/clang/test/CXX/drs/dr4xx.cpp +++ b/clang/test/CXX/drs/dr4xx.cpp @@ -6,6 +6,11 @@ // RUN: env ASAN_OPTIONS=detect_stack_use_after_return=0 %clang_cc1 -std=c++23 %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: env ASAN_OPTIONS=detect_stack_use_after_return=0 %clang_cc1 -std=c++2c %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + // FIXME: __SIZE_TYPE__ expands to 'long long' on some targets. __extension__ typedef __SIZE_TYPE__ size_t; @@ -217,7 +222,7 @@ namespace cwg407 { // cwg407: 3.8 } namespace cwg408 { // cwg408: 3.4 - template void g() { int arr[N != 1 ? 1 : -1]; } + template void g() { static_assert(N != 1, ""); } template<> void g<2>() { } template struct S { @@ -239,7 +244,7 @@ namespace cwg408 { // cwg408: 3.4 }; template int R::arr[1]; template void R::f() { - int arr[sizeof(arr) != sizeof(int) ? 1 : -1]; + static_assert(sizeof(arr) != sizeof(int), ""); } template<> int R::arr[2]; template void R::f(); @@ -842,11 +847,10 @@ namespace cwg451 { // cwg451: yes // expected-warning@-1 {{division by zero is undefined}} const int b = 1 / 0; // #cwg451-b // expected-warning@-1 {{division by zero is undefined}} - int arr[b]; // #cwg451-arr - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(b, ""); + // expected-error@-1 {{expression is not an integral constant expression}} // expected-note@-2 {{initializer of 'b' is not a constant expression}} // expected-note@#cwg451-b {{declared here}} - // expected-error@#cwg451-arr {{variable length array declaration not allowed at file scope}} } namespace cwg452 { // cwg452: yes @@ -876,11 +880,10 @@ namespace cwg456 { // cwg456: yes namespace cwg457 { // cwg457: yes const int a = 1; const volatile int b = 1; - int ax[a]; - int bx[b]; - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(a, ""); + static_assert(b, ""); + // expected-error@-1 {{expression is not an integral constant expression}} // expected-note@-2 {{read of volatile-qualified type 'const volatile int' is not allowed in a constant expression}} - // expected-error@-3 {{variable length array declaration not allowed at file scope}} enum E { ea = a, @@ -1276,20 +1279,18 @@ namespace cwg482 { // cwg482: 3.5 namespace cwg483 { // cwg483: yes namespace climits { - int check1[__SCHAR_MAX__ >= 127 ? 1 : -1]; - int check2[__SHRT_MAX__ >= 32767 ? 1 : -1]; - int check3[__INT_MAX__ >= 32767 ? 1 : -1]; - int check4[__LONG_MAX__ >= 2147483647 ? 1 : -1]; - int check5[__LONG_LONG_MAX__ >= 9223372036854775807 ? 1 : -1]; - // cxx98-error@-1 {{'long long' is a C++11 extension}} - // cxx98-error@-2 0-1{{'long long' is a C++11 extension}} + static_assert(__SCHAR_MAX__ >= 127, ""); + static_assert(__SHRT_MAX__ >= 32767, ""); + static_assert(__INT_MAX__ >= 32767, ""); + static_assert(__LONG_MAX__ >= 2147483647, ""); + static_assert(__LONG_LONG_MAX__ >= 9223372036854775807, ""); } namespace cstdint { - int check1[__PTRDIFF_WIDTH__ >= 16 ? 1 : -1]; - int check2[__SIG_ATOMIC_WIDTH__ >= 8 ? 
1 : -1]; - int check3[__SIZE_WIDTH__ >= 16 ? 1 : -1]; - int check4[__WCHAR_WIDTH__ >= 8 ? 1 : -1]; - int check5[__WINT_WIDTH__ >= 16 ? 1 : -1]; + static_assert(__PTRDIFF_WIDTH__ >= 16, ""); + static_assert(__SIG_ATOMIC_WIDTH__ >= 8, ""); + static_assert(__SIZE_WIDTH__ >= 16, ""); + static_assert(__WCHAR_WIDTH__ >= 8, ""); + static_assert(__WINT_WIDTH__ >= 16, ""); } } @@ -1366,11 +1367,10 @@ namespace cwg486 { // cwg486: yes namespace cwg487 { // cwg487: yes enum E { e }; int operator+(int, E); // #cwg487-operator-plus - int i[4 + e]; // #cwg487-i - // expected-error@-1 {{variable length arrays in C++ are a Clang extension}} + static_assert(4 + e, ""); + // expected-error@-1 {{expression is not an integral constant expression}} // since-cxx11-note@-2 {{non-constexpr function 'operator+' cannot be used in a constant expression}} // since-cxx11-note@#cwg487-operator-plus {{declared here}} - // expected-error@#cwg487-i {{variable length array declaration not allowed at file scope}} } namespace cwg488 { // cwg488: yes c++11 @@ -1485,13 +1485,13 @@ namespace cwg495 { // cwg495: 3.5 namespace cwg496 { // cwg496: sup 2094 struct A { int n; }; struct B { volatile int n; }; - int check1[ __is_trivially_copyable(const int) ? 1 : -1]; + static_assert(__is_trivially_copyable(const int), ""); // This checks the cwg2094 behavior, not cwg496 - int check2[ __is_trivially_copyable(volatile int) ? 1 : -1]; - int check3[ __is_trivially_constructible(A, const A&) ? 1 : -1]; - int check4[ __is_trivially_constructible(B, const B&) ? 1 : -1]; - int check5[ __is_trivially_assignable(A, const A&) ? 1 : -1]; - int check6[ __is_trivially_assignable(B, const B&) ? 1 : -1]; + static_assert(__is_trivially_copyable(volatile int), ""); + static_assert(__is_trivially_constructible(A, const A&), ""); + static_assert(__is_trivially_constructible(B, const B&), ""); + static_assert(__is_trivially_assignable(A, const A&), ""); + static_assert(__is_trivially_assignable(B, const B&), ""); } namespace cwg497 { // cwg497: sup 253 diff --git a/clang/test/CXX/drs/dr5xx.cpp b/clang/test/CXX/drs/dr5xx.cpp index 0fe64102d70b0..9d890f981348a 100644 --- a/clang/test/CXX/drs/dr5xx.cpp +++ b/clang/test/CXX/drs/dr5xx.cpp @@ -5,6 +5,11 @@ // RUN: %clang_cc1 -std=c++20 %s -verify=expected,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx23,since-cxx20,since-cxx17,since-cxx11 -fexceptions -fcxx-exceptions -pedantic-errors +#if __cplusplus == 199711L +#define static_assert(...) __extension__ _Static_assert(__VA_ARGS__) +// cxx98-error@-1 {{variadic macros are a C99 feature}} +#endif + // FIXME: This is included to avoid a diagnostic with no source location // pointing at the implicit operator new. We can't match such a diagnostic // with -verify. @@ -819,7 +824,7 @@ namespace cwg565 { // cwg565: yes namespace cwg566 { // cwg566: yes #if __cplusplus >= 201103L - int check[int(-3.99) == -3 ? 1 : -1]; + static_assert(int(-3.99) == -3, ""); #endif } @@ -834,7 +839,7 @@ namespace cwg568 { // cwg568: 3.0 c++11 public: int n; }; - int check_trivial[__is_trivial(trivial) ? 1 : -1]; + static_assert(__is_trivial(trivial), ""); struct std_layout { std_layout(); @@ -843,7 +848,7 @@ namespace cwg568 { // cwg568: 3.0 c++11 private: int n; }; - int check_std_layout[__is_standard_layout(std_layout) ? 
1 : -1]; + static_assert(__is_standard_layout(std_layout), ""); struct aggregate { int x; @@ -885,7 +890,7 @@ namespace cwg570 { // cwg570: dup 633 namespace cwg572 { // cwg572: yes enum E { a = 1, b = 2 }; - int check[a + b == 3 ? 1 : -1]; + static_assert(a + b == 3, ""); } namespace cwg573 { // cwg573: no diff --git a/clang/test/CXX/drs/dr6xx.cpp b/clang/test/CXX/drs/dr6xx.cpp index 9d3613ae8589e..069102d9c5975 100644 --- a/clang/test/CXX/drs/dr6xx.cpp +++ b/clang/test/CXX/drs/dr6xx.cpp @@ -144,7 +144,7 @@ namespace cwg608 { // cwg608: yes struct D : B, C {}; } -int cwg610[-0u == 0u ? 1 : -1]; // cwg610: yes +static_assert(-0u == 0u, ""); // cwg610: yes namespace cwg611 { // cwg611: yes int k; @@ -190,8 +190,8 @@ namespace cwg613 { // cwg613: yes c++11 } } -int cwg614_a[(-1) / 2 == 0 ? 1 : -1]; // cwg614: yes -int cwg614_b[(-1) % 2 == -1 ? 1 : -1]; +static_assert((-1) / 2 == 0, ""); // cwg614: yes +static_assert((-1) % 2 == -1, ""); namespace cwg615 { // cwg615: yes int f(); diff --git a/clang/test/ClangScanDeps/error.cpp b/clang/test/ClangScanDeps/error.cpp index 0095a6c900c3b..593dbf35edca5 100644 --- a/clang/test/ClangScanDeps/error.cpp +++ b/clang/test/ClangScanDeps/error.cpp @@ -1,23 +1,10 @@ // RUN: rm -rf %t // RUN: split-file %s %t -//--- missing_tu.json.in -[{ - "directory": "DIR", - "command": "clang -fsyntax-only DIR/missing_tu.c", - "file": "DIR/missing_tu.c" -}] -//--- missing_header.json.in -[{ - "directory": "DIR", - "command": "clang -fsyntax-only DIR/missing_header.c", - "file": "DIR/missing_header.c" -}] //--- missing_header.c #include "missing.h" -// RUN: sed -e "s|DIR|%/t|g" %t/missing_tu.json.in > %t/missing_tu.json -// RUN: not clang-scan-deps -compilation-database %t/missing_tu.json 2>%t/missing_tu.errs +// RUN: not clang-scan-deps -- %clang -c %t/missing_tu.c 2>%t/missing_tu.errs // RUN: echo EOF >> %t/missing_tu.errs // RUN: cat %t/missing_tu.errs | sed 's:\\\\\?:/:g' | FileCheck %s --check-prefix=CHECK-MISSING-TU -DPREFIX=%/t // CHECK-MISSING-TU: Error while scanning dependencies for [[PREFIX]]/missing_tu.c @@ -26,8 +13,7 @@ // CHECK-MISSING-TU-NEXT: error: // CHECK-MISSING-TU-NEXT: EOF -// RUN: sed -e "s|DIR|%/t|g" %t/missing_header.json.in > %t/missing_header.json -// RUN: not clang-scan-deps -compilation-database %t/missing_header.json 2>%t/missing_header.errs +// RUN: not clang-scan-deps -- %clang -c %t/missing_header.c 2>%t/missing_header.errs // RUN: echo EOF >> %t/missing_header.errs // RUN: cat %t/missing_header.errs | sed 's:\\\\\?:/:g' | FileCheck %s --check-prefix=CHECK-MISSING-HEADER -DPREFIX=%/t // CHECK-MISSING-HEADER: Error while scanning dependencies for [[PREFIX]]/missing_header.c diff --git a/clang/test/ClangScanDeps/module-format.c b/clang/test/ClangScanDeps/module-format.c index 001a011ae0b59..0a6abec80dd90 100644 --- a/clang/test/ClangScanDeps/module-format.c +++ b/clang/test/ClangScanDeps/module-format.c @@ -16,7 +16,7 @@ // RUN: rm -f %t/cdb_pch.json // RUN: sed "s|DIR|%/t|g" %S/Inputs/modules-pch/cdb_pch.json > %t/cdb_pch.json // RUN: clang-scan-deps -compilation-database %t/cdb_pch.json -format experimental-full \ -// RUN: -module-files-dir %t/build > %t/result_pch.json +// RUN: -module-files-dir %t/build -o %t/result_pch.json // Explicitly build the PCH: // diff --git a/clang/test/ClangScanDeps/removed-args.c b/clang/test/ClangScanDeps/removed-args.c index f49e4ead82f7b..3e108f0549450 100644 --- a/clang/test/ClangScanDeps/removed-args.c +++ b/clang/test/ClangScanDeps/removed-args.c @@ -93,3 +93,31 @@ // CHECK-NOT: 
"-fmodules-prune-interval= // CHECK-NOT: "-fmodules-prune-after= // CHECK: ], + +// Check for removed args for PCH invocations. + +// RUN: split-file %s %t +// RUN: sed "s|DIR|%/t|g" %t/cdb-pch.json.template > %t/cdb-pch.json +// RUN: clang-scan-deps -compilation-database %t/cdb-pch.json -format experimental-full > %t/result-pch.json +// RUN: cat %t/result-pch.json | sed 's:\\\\\?:/:g' | FileCheck %s -DPREFIX=%/t -check-prefix=PCH +// +// PCH-NOT: "-fdebug-compilation-dir=" +// PCH-NOT: "-fcoverage-compilation-dir=" +// PCH-NOT: "-coverage-notes-file +// PCH-NOT: "-coverage-data-file +// PCH-NOT: "-fprofile-instrument-use-path +// PCH-NOT: "-include" +// PCH-NOT: "-fmodules-cache-path= +// PCH-NOT: "-fmodules-validate-once-per-build-session" +// PCH-NOT: "-fbuild-session-timestamp= +// PCH-NOT: "-fmodules-prune-interval= +// PCH-NOT: "-fmodules-prune-after= + +//--- cdb-pch.json.template +[ + { + "directory": "DIR", + "command": "clang -x c-header DIR/header.h -fmodules -fimplicit-module-maps -fmodules-cache-path=DIR/cache -fdebug-compilation-dir=DIR/debug -fcoverage-compilation-dir=DIR/coverage -ftest-coverage -fprofile-instr-use=DIR/tu.profdata -o DIR/header.h.pch -serialize-diagnostics DIR/header.h.pch.diag ", + "file": "DIR/header.h.pch" + } +] diff --git a/clang/test/CodeGen/aarch64-cpu-supports.c b/clang/test/CodeGen/aarch64-cpu-supports.c index c54b7475a3fd5..7fad9724dfb6c 100644 --- a/clang/test/CodeGen/aarch64-cpu-supports.c +++ b/clang/test/CodeGen/aarch64-cpu-supports.c @@ -1,15 +1,17 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --version 2 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s +//. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +//. 
// CHECK-LABEL: define dso_local i32 @main // CHECK-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 34359738368 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 34359738368 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] // CHECK: if.then: @@ -17,8 +19,8 @@ // CHECK-NEXT: br label [[RETURN:%.*]] // CHECK: if.end: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 9070970929152 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 9070970929152 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 17716740096 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17716740096 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[IF_THEN1:%.*]], label [[IF_END2:%.*]] // CHECK: if.then1: @@ -26,8 +28,8 @@ // CHECK-NEXT: br label [[RETURN]] // CHECK: if.end2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 166633186212708352 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 166633186212708352 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 5222680231936 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 5222680231936 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[IF_THEN3:%.*]], label [[IF_END4:%.*]] // CHECK: if.then3: @@ -49,10 +51,10 @@ int main(void) { if (__builtin_cpu_supports("sb")) return 1; - if (__builtin_cpu_supports("sve2-pmull128+memtag")) + if (__builtin_cpu_supports("sve2-aes+memtag")) return 2; - if (__builtin_cpu_supports("sme2+ls64_v+wfxt")) + if (__builtin_cpu_supports("sme2+ls64+wfxt")) return 3; if (__builtin_cpu_supports("avx2")) @@ -60,3 +62,9 @@ int main(void) { return 0; } +//. +// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
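The CHECK lines in the test above all exercise one lowering pattern: __builtin_cpu_supports("a+b") ORs the bit positions of the named features into a single constant mask, then tests whether the runtime feature word __aarch64_cpu_features contains that entire mask (the paired "and" + "icmp eq" instructions). A minimal C sketch of that subset test follows; has_all_features and its parameter names are illustrative only, not part of compiler-rt's actual interface:

    #include <stdint.h>

    /* Subset test matching the IR pattern: AND the feature word with the
       required mask, then compare the result back against the same mask.
       This returns 1 only if every required bit is set. */
    static int has_all_features(uint64_t cpu_features, uint64_t required_mask) {
      return (cpu_features & required_mask) == required_mask;
    }

For example, with required_mask = 34359738368 (bit 35, as in the first branch above), the predicate holds only when bit 35 of cpu_features is set; a mask combining several features requires all of their bits at once.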
diff --git a/clang/test/CodeGen/aarch64-mixed-target-attributes.c b/clang/test/CodeGen/aarch64-mixed-target-attributes.c index aef6ce36ab1c0..be290ff9ecee6 100644 --- a/clang/test/CodeGen/aarch64-mixed-target-attributes.c +++ b/clang/test/CodeGen/aarch64-mixed-target-attributes.c @@ -69,8 +69,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -143,8 +143,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -210,8 +210,8 @@ __attribute__((target_version("jscvt"))) int default_def_with_version_decls(void // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: diff --git a/clang/test/CodeGen/alias.cpp b/clang/test/CodeGen/alias.cpp index 17c1e1ae32f03..a468c31d369ed 100644 --- a/clang/test/CodeGen/alias.cpp +++ b/clang/test/CodeGen/alias.cpp @@ -1,27 +1,42 @@ -// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only %s -// RUN: not %clang_cc1 -triple x86_64-linux -emit-llvm-only -fdiagnostics-parseable-fixits %s 2>&1 | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux -verify -emit-llvm-only -DERR %s +// RUN: not %clang_cc1 -triple x86_64-linux -emit-llvm-only -fdiagnostics-parseable-fixits -DERR %s 2>&1 | FileCheck %s --check-prefix=FIXIT +// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm %s -o - | FileCheck %s +#ifdef ERR void *f1_ifunc(void) { return nullptr; } void f1(void) __attribute__((alias("f1_ifunc"))); // expected-error@-1 {{alias must point to a defined variable or function}} // expected-note@-2 {{must refer to its mangled name}} // expected-note@-3 {{function by that name is mangled as}} -// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:47}:"alias(\"_Z8f1_ifuncv\")" +// FIXIT: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:47}:"alias(\"_Z8f1_ifuncv\")" void *f6_resolver_resolver(void) { return 0; } void *f6_resolver(void) __attribute__((alias("f6_resolver_resolver"))); // 
expected-error@-1 {{alias must point to a defined variable or function}} // expected-note@-2 {{must refer to its mangled name}} // expected-note@-3 {{function by that name is mangled as}} -// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:40-[[@LINE-4]]:69}:"alias(\"_Z20f6_resolver_resolverv\")" +// FIXIT: fix-it:"{{.*}}":{[[@LINE-4]]:40-[[@LINE-4]]:69}:"alias(\"_Z20f6_resolver_resolverv\")" void f6(void) __attribute__((alias("f6_resolver"))); // expected-error@-1 {{alias must point to a defined variable or function}} // expected-note@-2 {{must refer to its mangled name}} // expected-note@-3 {{function by that name is mangled as}} -// CHECK: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:50}:"alias(\"_Z11f6_resolverv\")" +// FIXIT: fix-it:"{{.*}}":{[[@LINE-4]]:30-[[@LINE-4]]:50}:"alias(\"_Z11f6_resolverv\")" __attribute__((unused, alias("resolver"), deprecated("hahahaha, isn't C great?"))) void func(); // expected-error@-2 {{alias must point to a defined variable or function}} // expected-note@-3 {{must refer to its mangled name}} +#endif +// CHECK: @_ZN4libc4log2Ed ={{.*}} alias double (double), ptr @log2 +// CHECK: define{{.*}} @log2( +namespace libc { double log2(double x); } +extern "C" double log2(double); +namespace std { using ::log2; } +using std::log2; + +namespace libc { +decltype(libc::log2) __log2_impl__ __asm__("log2"); +decltype(libc::log2) log2 [[gnu::alias("log2")]]; +double __log2_impl__(double x) { return x; } +} diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 8c8b951e9118d..c715001f6a722 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -3,7 +3,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -S -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV int __attribute__((target_clones("lse+aes", "sve2"))) ftc(void) { return 0; } -int __attribute__((target_clones("sha2", "sha2+memtag2", " default "))) ftc_def(void) { return 1; } +int __attribute__((target_clones("sha2", "sha2+memtag", " default "))) ftc_def(void) { return 1; } int __attribute__((target_clones("sha2", "default"))) ftc_dup1(void) { return 2; } int __attribute__((target_clones("fp", "crc+dotprod"))) ftc_dup2(void) { return 3; } int foo() { @@ -12,7 +12,7 @@ int foo() { inline int __attribute__((target_clones("rng+simd", "rcpc+predres", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; } inline int __attribute__((target_clones("fp16", "fcma+sve2-bitperm", "default"))) ftc_inline2(void); -inline int __attribute__((target_clones("bti", "sve+sb"))) ftc_inline3(void) { return 3; } +inline int __attribute__((target_clones("mops", "sve+sb"))) ftc_inline3(void) { return 3; } int __attribute__((target_clones("default"))) ftc_direct(void) { return 4; } @@ -56,16 +56,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16512 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16512 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 8320 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 8320 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc._MaesMlse // CHECK: resolver_else: // 
CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 68719476736 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 68719476736 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 268435456 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 268435456 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -81,7 +81,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_def._Mmemtag2Msha2( +// CHECK-LABEL: @ftc_def._MmemtagMsha2( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // @@ -90,16 +90,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 17592186048512 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 17592186048512 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 17179871232 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 17179871232 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @ftc_def._Mmemtag2Msha2 +// CHECK-NEXT: ret ptr @ftc_def._MmemtagMsha2 // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4096 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4096 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2048 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2048 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -118,8 +118,8 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4096 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4096 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2048 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2048 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -198,16 +198,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014535948435456 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014535948435456 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 550292684800 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 550292684800 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr 
@__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 68720001024 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 68720001024 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -228,16 +228,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 549757911040 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 549757911040 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1074003968 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1074003968 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_inline2._MfcmaMsve2-bitperm // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 65536 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 65536 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16384 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16384 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -250,20 +250,20 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70369817919488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70369817919488 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 34393292800 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 34393292800 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_inline3._MsbMsve // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 17592186044416 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 17592186044416 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline3._Mbti +// CHECK-NEXT: ret ptr @ftc_inline3._Mmops // CHECK: resolver_else2: // CHECK-NEXT: ret ptr @ftc_inline3.default // @@ -329,7 +329,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline3._Mbti( +// CHECK-LABEL: @ftc_inline3._Mmops( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -407,17 +407,16 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: 
attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" } // CHECK: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" } // CHECK: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sha2" } -// CHECK: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+mte,+neon,+sha2" } -// CHECK: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" } -// CHECK: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" } -// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" } -// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" } -// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" } -// CHECK: attributes #[[ATTR13:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } +// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" } +// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" } +// CHECK: attributes 
#[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" } +// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" } +// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } +// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" } //. // CHECK-NOFMV: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index dd4cbbf5a8986..e71370e6d91df 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -5,11 +5,11 @@ int __attribute__((target_version("rng+flagm+fp16fml"))) fmv(void) { return 1; } int __attribute__((target_version("flagm2+sme-i16i64"))) fmv(void) { return 2; } int __attribute__((target_version("lse+sha2"))) fmv(void) { return 3; } -int __attribute__((target_version("dotprod+ls64_accdata"))) fmv(void) { return 4; } +int __attribute__((target_version("dotprod+ls64"))) fmv(void) { return 4; } int __attribute__((target_version("fp16fml+memtag"))) fmv(void) { return 5; } int __attribute__((target_version("fp+aes"))) fmv(void) { return 6; } -int __attribute__((target_version("crc+ls64_v"))) fmv(void) { return 7; } -int __attribute__((target_version("bti"))) fmv(void) { return 8; } +int __attribute__((target_version("crc+ls64"))) fmv(void) { return 7; } +int __attribute__((target_version("mops"))) fmv(void) { return 8; } int __attribute__((target_version("sme2"))) fmv(void) { return 9; } int __attribute__((target_version("default"))) fmv(void); int __attribute__((target_version("ls64+simd"))) fmv_one(void) { return 1; } @@ -17,25 +17,25 @@ int __attribute__((target_version("dpb"))) fmv_one(void) { return 2; } int __attribute__((target_version("default"))) fmv_one(void); int __attribute__((target_version("fp"))) fmv_two(void) { return 1; } int __attribute__((target_version("simd"))) fmv_two(void) { return 2; } -int __attribute__((target_version("dgh"))) fmv_two(void) { return 3; } +int __attribute__((target_version("frintts"))) fmv_two(void) { return 3; } int __attribute__((target_version("fp16+simd"))) fmv_two(void) { return 4; } int __attribute__((target_version("default"))) fmv_two(void); int foo() { return fmv()+fmv_one()+fmv_two(); } -inline int __attribute__((target_version("sha1+pmull+f64mm"))) fmv_inline(void) { return 1; } +inline int __attribute__((target_version("crypto+f64mm"))) fmv_inline(void) { return 1; } inline int __attribute__((target_version("fp16+fcma+rdma+sme+ fp16 "))) fmv_inline(void) { return 2; } inline int __attribute__((target_version("sha3+i8mm+f32mm"))) fmv_inline(void) { return 12; } -inline int __attribute__((target_version("dit+sve-ebf16"))) fmv_inline(void) { return 8; } +inline int __attribute__((target_version("sme2+ssbs"))) fmv_inline(void) { return 8; } inline int __attribute__((target_version("dpb+rcpc2 "))) fmv_inline(void) { 
return 6; } inline int __attribute__((target_version(" dpb2 + jscvt"))) fmv_inline(void) { return 7; } inline int __attribute__((target_version("rcpc+frintts"))) fmv_inline(void) { return 3; } -inline int __attribute__((target_version("sve+sve-bf16"))) fmv_inline(void) { return 4; } +inline int __attribute__((target_version("sve+sme"))) fmv_inline(void) { return 4; } inline int __attribute__((target_version("sve2-aes+sve2-sha3"))) fmv_inline(void) { return 5; } -inline int __attribute__((target_version("sve2+sve2-pmull128+sve2-bitperm"))) fmv_inline(void) { return 9; } -inline int __attribute__((target_version("sve2-sm4+memtag2"))) fmv_inline(void) { return 10; } -inline int __attribute__((target_version("memtag3+rcpc3+mops"))) fmv_inline(void) { return 11; } +inline int __attribute__((target_version("sve2+sve2-aes+sve2-bitperm"))) fmv_inline(void) { return 9; } +inline int __attribute__((target_version("sve2-sm4+memtag"))) fmv_inline(void) { return 10; } +inline int __attribute__((target_version("memtag+rcpc3+mops"))) fmv_inline(void) { return 11; } inline int __attribute__((target_version("aes+dotprod"))) fmv_inline(void) { return 13; } inline int __attribute__((target_version("simd+fp16fml"))) fmv_inline(void) { return 14; } inline int __attribute__((target_version("fp+sm4"))) fmv_inline(void) { return 15; } @@ -186,7 +186,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64_accdata +// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64 // CHECK-SAME: () #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 @@ -207,14 +207,14 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64_v +// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64 // CHECK-SAME: () #[[ATTR6:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._Mbti +// CHECK-LABEL: define {{[^@]+}}@fmv._Mmops // CHECK-SAME: () #[[ATTR7:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 8 @@ -256,7 +256,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mdgh +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfrintts // CHECK-SAME: () #[[ATTR11:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 @@ -271,7 +271,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@foo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() @@ -293,68 +293,68 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv._MflagmMfp16fmlMrng // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927940 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 72057594037927940 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2199023255556 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2199023255556 // CHECK-NEXT: [[TMP7:%.*]] = and 
i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @fmv._Mflagm2Msme-i16i64 // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254741008 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254741008 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 274877906960 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 274877906960 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv._MdotprodMls64_accdata +// CHECK-NEXT: ret ptr @fmv._MdotprodMls64 // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 4503599627371520 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 4503599627371520 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 274877907968 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 274877907968 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv._McrcMls64_v +// CHECK-NEXT: ret ptr @fmv._McrcMls64 // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 8796093022216 -// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 8796093022216 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17179869192 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17179869192 // CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] // CHECK-NEXT: br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]] // CHECK: resolver_return7: // CHECK-NEXT: ret ptr @fmv._Mfp16fmlMmemtag // CHECK: resolver_else8: // CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 16640 -// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 16640 +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 8448 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 8448 // CHECK-NEXT: [[TMP23:%.*]] = and i1 true, [[TMP22]] // CHECK-NEXT: br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]] // CHECK: resolver_return9: // CHECK-NEXT: ret ptr @fmv._MaesMfp // CHECK: resolver_else10: // CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 4224 -// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 4224 +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 2176 +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 2176 // CHECK-NEXT: [[TMP27:%.*]] = and i1 true, [[TMP26]] // CHECK-NEXT: br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]] // CHECK: resolver_return11: // CHECK-NEXT: ret ptr @fmv._MlseMsha2 // CHECK: resolver_else12: // CHECK-NEXT: [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 144115188075855872 -// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 144115188075855872 +// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 17592186044416 +// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 17592186044416 // CHECK-NEXT: 
[[TMP31:%.*]] = and i1 true, [[TMP30]] // CHECK-NEXT: br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]] // CHECK: resolver_return13: -// CHECK-NEXT: ret ptr @fmv._Msme2 +// CHECK-NEXT: ret ptr @fmv._Mmops // CHECK: resolver_else14: // CHECK-NEXT: [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 1125899906842624 -// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 1125899906842624 +// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 4398046511104 +// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 4398046511104 // CHECK-NEXT: [[TMP35:%.*]] = and i1 true, [[TMP34]] // CHECK-NEXT: br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]] // CHECK: resolver_return15: -// CHECK-NEXT: ret ptr @fmv._Mbti +// CHECK-NEXT: ret ptr @fmv._Msme2 // CHECK: resolver_else16: // CHECK-NEXT: ret ptr @fmv.default // @@ -363,16 +363,16 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685760 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685760 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 274877907456 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 274877907456 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_one._Mls64Msimd // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 262144 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 262144 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 32768 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 32768 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -385,20 +385,20 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 66048 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 66048 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16896 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16896 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_two._Mfp16Msimd // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 33554432 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33554432 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2097152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2097152 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @fmv_two._Mdgh +// CHECK-NEXT: ret ptr @fmv_two._Mfrintts // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr 
@__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 512 @@ -421,49 +421,49 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_e.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 20 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d._Msb -// CHECK-SAME: () #[[ATTR13:[0-9]+]] { +// CHECK-SAME: () #[[ATTR14:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_d.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 111 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_c.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@goo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() @@ -477,96 +477,96 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4398048673856 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398048673856 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 8590213184 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 8590213184 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_inline._MfcmaMfp16MrdmMsme // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 864726312827224064 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 864726312827224064 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 26405458935808 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 26405458935808 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @fmv_inline._Mmemtag3MmopsMrcpc3 +// CHECK-NEXT: ret ptr @fmv_inline._MmemtagMmopsMrcpc3 // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 893353197568 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 893353197568 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 1879048192 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 1879048192 // 
CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 +// CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-aesMsve2-bitperm // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 34359773184 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 34359773184 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 71307264 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 71307264 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv_inline._Mf64mmMpmullMsha1 +// CHECK-NEXT: ret ptr @fmv_inline._Mf32mmMi8mmMsha3 // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17246986240 -// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17246986240 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 4535485464576 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 4535485464576 // CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] // CHECK-NEXT: br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]] // CHECK: resolver_return7: -// CHECK-NEXT: ret ptr @fmv_inline._Mf32mmMi8mmMsha3 +// CHECK-NEXT: ret ptr @fmv_inline._Msme2Mssbs // CHECK: resolver_else8: // CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 19791209299968 -// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 19791209299968 +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 21474836480 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 21474836480 // CHECK-NEXT: [[TMP23:%.*]] = and i1 true, [[TMP22]] // CHECK-NEXT: br i1 [[TMP23]], label [[RESOLVER_RETURN9:%.*]], label [[RESOLVER_ELSE10:%.*]] // CHECK: resolver_return9: -// CHECK-NEXT: ret ptr @fmv_inline._Mmemtag2Msve2-sm4 +// CHECK-NEXT: ret ptr @fmv_inline._MmemtagMsve2-sm4 // CHECK: resolver_else10: // CHECK-NEXT: [[TMP24:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 1236950581248 -// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 1236950581248 +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP24]], 8623489024 +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP25]], 8623489024 // CHECK-NEXT: [[TMP27:%.*]] = and i1 true, [[TMP26]] // CHECK-NEXT: br i1 [[TMP27]], label [[RESOLVER_RETURN11:%.*]], label [[RESOLVER_ELSE12:%.*]] // CHECK: resolver_return11: -// CHECK-NEXT: ret ptr @fmv_inline._Msve2-aesMsve2-sha3 +// CHECK-NEXT: ret ptr @fmv_inline._MsmeMsve // CHECK: resolver_else12: // CHECK-NEXT: [[TMP28:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 4295098368 -// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 4295098368 +// CHECK-NEXT: [[TMP29:%.*]] = and i64 [[TMP28]], 2684354560 +// CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[TMP29]], 2684354560 // CHECK-NEXT: [[TMP31:%.*]] = and i1 true, [[TMP30]] // CHECK-NEXT: br i1 [[TMP31]], label [[RESOLVER_RETURN13:%.*]], label [[RESOLVER_ELSE14:%.*]] // CHECK: resolver_return13: -// CHECK-NEXT: ret ptr @fmv_inline._MditMsve-ebf16 +// CHECK-NEXT: ret ptr @fmv_inline._Msve2-aesMsve2-sha3 // CHECK: 
resolver_else14: // CHECK-NEXT: [[TMP32:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 3221225472 -// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 3221225472 +// CHECK-NEXT: [[TMP33:%.*]] = and i64 [[TMP32]], 281475110928384 +// CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[TMP33]], 281475110928384 // CHECK-NEXT: [[TMP35:%.*]] = and i1 true, [[TMP34]] // CHECK-NEXT: br i1 [[TMP35]], label [[RESOLVER_RETURN15:%.*]], label [[RESOLVER_ELSE16:%.*]] // CHECK: resolver_return15: -// CHECK-NEXT: ret ptr @fmv_inline._MsveMsve-bf16 +// CHECK-NEXT: ret ptr @fmv_inline._McryptoMf64mm // CHECK: resolver_else16: // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 20971520 -// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 20971520 +// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 2621440 +// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 2621440 // CHECK-NEXT: [[TMP39:%.*]] = and i1 true, [[TMP38]] // CHECK-NEXT: br i1 [[TMP39]], label [[RESOLVER_RETURN17:%.*]], label [[RESOLVER_ELSE18:%.*]] // CHECK: resolver_return17: // CHECK-NEXT: ret ptr @fmv_inline._MfrinttsMrcpc // CHECK: resolver_else18: // CHECK-NEXT: [[TMP40:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP41:%.*]] = and i64 [[TMP40]], 8650752 -// CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 8650752 +// CHECK-NEXT: [[TMP41:%.*]] = and i64 [[TMP40]], 1081344 +// CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[TMP41]], 1081344 // CHECK-NEXT: [[TMP43:%.*]] = and i1 true, [[TMP42]] // CHECK-NEXT: br i1 [[TMP43]], label [[RESOLVER_RETURN19:%.*]], label [[RESOLVER_ELSE20:%.*]] // CHECK: resolver_return19: // CHECK-NEXT: ret ptr @fmv_inline._MdpbMrcpc2 // CHECK: resolver_else20: // CHECK-NEXT: [[TMP44:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP45:%.*]] = and i64 [[TMP44]], 1572864 -// CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 1572864 +// CHECK-NEXT: [[TMP45:%.*]] = and i64 [[TMP44]], 196608 +// CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[TMP45]], 196608 // CHECK-NEXT: [[TMP47:%.*]] = and i1 true, [[TMP46]] // CHECK-NEXT: br i1 [[TMP47]], label [[RESOLVER_RETURN21:%.*]], label [[RESOLVER_ELSE22:%.*]] // CHECK: resolver_return21: @@ -581,8 +581,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv_inline._Mfp16fmlMsimd // CHECK: resolver_else24: // CHECK-NEXT: [[TMP52:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP53:%.*]] = and i64 [[TMP52]], 16400 -// CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[TMP53]], 16400 +// CHECK-NEXT: [[TMP53:%.*]] = and i64 [[TMP52]], 8208 +// CHECK-NEXT: [[TMP54:%.*]] = icmp eq i64 [[TMP53]], 8208 // CHECK-NEXT: [[TMP55:%.*]] = and i1 true, [[TMP54]] // CHECK-NEXT: br i1 [[TMP55]], label [[RESOLVER_RETURN25:%.*]], label [[RESOLVER_ELSE26:%.*]] // CHECK: resolver_return25: @@ -611,8 +611,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 2251799813685248 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 2251799813685248 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 274877906944 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 274877906944 // CHECK-NEXT: [[TMP3:%.*]] = and i1 
true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -625,8 +625,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 70368744177664 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 70368744177664 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 34359738368 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 34359738368 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -639,8 +639,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 137438953472 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 137438953472 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -651,7 +651,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@recur -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: call void @reca() // CHECK-NEXT: ret void @@ -659,7 +659,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@main -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 @@ -670,7 +670,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@hoo -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 @@ -687,14 +687,14 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_forward_default_decl._Mmops -// CHECK-SAME: () #[[ATTR14:[0-9]+]] { +// CHECK-SAME: () #[[ATTR7]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_extern_forward_default_decl._Mdotprod -// CHECK-SAME: () #[[ATTR15:[0-9]+]] { +// CHECK-SAME: () #[[ATTR3]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // @@ -708,14 +708,14 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def._Msve -// CHECK-SAME: () #[[ATTR16:[0-9]+]] { +// CHECK-SAME: () #[[ATTR15:[0-9]+]] { // 
CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_default_def.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // @@ -729,56 +729,56 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_default_def.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_with_implicit_forward_default_def._Mlse -// CHECK-SAME: () #[[ATTR17:[0-9]+]] { +// CHECK-SAME: () #[[ATTR16:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@unused_without_default._Mrdm -// CHECK-SAME: () #[[ATTR18:[0-9]+]] { +// CHECK-SAME: () #[[ATTR17:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@default_def_with_version_decls.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mjscvt -// CHECK-SAME: () #[[ATTR21:[0-9]+]] { +// CHECK-SAME: () #[[ATTR20:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@used_def_without_default_decl._Mrdm -// CHECK-SAME: () #[[ATTR18]] { +// CHECK-SAME: () #[[ATTR17]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@caller -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @used_def_without_default_decl() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @used_decl_without_default_decl() @@ -790,8 +790,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -812,8 +812,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and 
i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -831,92 +831,92 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf64mmMpmullMsha1 -// CHECK-SAME: () #[[ATTR22:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._McryptoMf64mm +// CHECK-SAME: () #[[ATTR21:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfcmaMfp16MrdmMsme -// CHECK-SAME: () #[[ATTR23:[0-9]+]] { +// CHECK-SAME: () #[[ATTR22:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mf32mmMi8mmMsha3 -// CHECK-SAME: () #[[ATTR24:[0-9]+]] { +// CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 12 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16 -// CHECK-SAME: () #[[ATTR25:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msme2Mssbs +// CHECK-SAME: () #[[ATTR8]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 8 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2 -// CHECK-SAME: () #[[ATTR26:[0-9]+]] { +// CHECK-SAME: () #[[ATTR24:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 6 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt -// CHECK-SAME: () #[[ATTR27:[0-9]+]] { +// CHECK-SAME: () #[[ATTR25:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfrinttsMrcpc -// CHECK-SAME: () #[[ATTR28:[0-9]+]] { +// CHECK-SAME: () #[[ATTR26:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16 -// CHECK-SAME: () #[[ATTR29:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsmeMsve +// CHECK-SAME: () #[[ATTR27:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3 -// CHECK-SAME: () #[[ATTR30:[0-9]+]] { +// CHECK-SAME: () #[[ATTR28:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 5 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-bitpermMsve2-pmull128 -// CHECK-SAME: () #[[ATTR31:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-aesMsve2-bitperm +// CHECK-SAME: () #[[ATTR29:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 9 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag2Msve2-sm4 -// CHECK-SAME: () #[[ATTR32:[0-9]+]] { +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMsve2-sm4 +// CHECK-SAME: () #[[ATTR30:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 10 // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mmemtag3MmopsMrcpc3 -// CHECK-SAME: () #[[ATTR33:[0-9]+]] { +// 
CHECK-LABEL: define {{[^@]+}}@fmv_inline._MmemtagMmopsMrcpc3 +// CHECK-SAME: () #[[ATTR31:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 11 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MaesMdotprod -// CHECK-SAME: () #[[ATTR15]] { +// CHECK-SAME: () #[[ATTR3]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 13 // @@ -930,21 +930,21 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MfpMsm4 -// CHECK-SAME: () #[[ATTR34:[0-9]+]] { +// CHECK-SAME: () #[[ATTR32:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 15 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline._MlseMrdm -// CHECK-SAME: () #[[ATTR35:[0-9]+]] { +// CHECK-SAME: () #[[ATTR33:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 16 // // // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@fmv_inline.default -// CHECK-SAME: () #[[ATTR11]] { +// CHECK-SAME: () #[[ATTR13]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -953,8 +953,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 33554432 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33554432 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -967,8 +967,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65536 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65536 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16384 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16384 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -995,8 +995,8 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1048576 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1048576 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 131072 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 131072 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -1132,39 +1132,37 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp16fml,+fullfp16,+neon,+rand,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR1]] = { noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+sha2,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+ls64,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp16fml,+fullfp16,+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,-fp-armv8,-v9.5a" } // CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" } -// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+sb,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR18:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sha2,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sme,+sve,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+rcpc,+rcpc3,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" } +// CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" 
"target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" } //. // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/CodeGen/bitfield-access-unit.c b/clang/test/CodeGen/bitfield-access-unit.c index 1aed2e7202fc6..d0553c5183eef 100644 --- a/clang/test/CodeGen/bitfield-access-unit.c +++ b/clang/test/CodeGen/bitfield-access-unit.c @@ -222,6 +222,24 @@ struct G { // LAYOUT-DWN32-NEXT: +struct __attribute__((aligned(8))) H { + char a; + unsigned b : 24; // on expensive alignment we want this to stay 24 + unsigned c __attribute__((aligned(8))); // Think 'long long' or lp64 ptr +} h; +// CHECK-LABEL: LLVMType:%struct.H = +// LAYOUT-FLEX-SAME: type <{ i8, i32, [3 x i8], i32, [4 x i8] }> +// LAYOUT-STRICT-SAME: type { i8, [3 x i8], [4 x i8], i32, [4 x i8] } +// LAYOUT-DWN32-FLEX-SAME: type <{ i8, i32, [3 x i8], i32, [4 x i8] }> +// LAYOUT-DWN32-STRICT-SAME: type { i8, [3 x i8], [4 x i8], i32, [4 x i8] } +// CHECK: BitFields:[ +// LAYOUT-FLEX-NEXT: + #if _LP64 struct A64 { int a : 16; diff --git a/clang/test/CodeGen/builtin-allow-runtime-check.cpp b/clang/test/CodeGen/builtin-allow-runtime-check.cpp new file mode 100644 index 0000000000000..db3f59a9d48a1 --- /dev/null +++ b/clang/test/CodeGen/builtin-allow-runtime-check.cpp @@ -0,0 +1,29 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 -cc1 -triple x86_64-pc-linux-gnu -emit-llvm -o - %s | FileCheck %s + +static_assert(__has_builtin(__builtin_allow_runtime_check), ""); + +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z4testv( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.allow.runtime.check(metadata !"mycheck") +// CHECK-NEXT: ret i1 [[TMP0]] +// +bool test() { + return __builtin_allow_runtime_check("mycheck"); +} + +// CHECK-LABEL: define dso_local noundef zeroext i1 @_Z10test_twicev( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i1 @llvm.allow.runtime.check(metadata !"mycheck") +// CHECK-NEXT: [[CONV:%.*]] = zext i1 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.allow.runtime.check(metadata !"mycheck") +// CHECK-NEXT: [[CONV1:%.*]] = zext i1 [[TMP1]] to i32 +// CHECK-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[CONV1]] +// CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[OR]], 0 +// CHECK-NEXT: ret i1 [[TOBOOL]] +// +bool test_twice() { + return __builtin_allow_runtime_check("mycheck") | __builtin_allow_runtime_check("mycheck"); +} diff --git a/clang/test/CodeGen/pgo-force-function-attrs.ll b/clang/test/CodeGen/pgo-force-function-attrs.ll new file mode 100644 index 0000000000000..3e9ea95e4df41 --- /dev/null +++ b/clang/test/CodeGen/pgo-force-function-attrs.ll @@ -0,0 +1,12 @@ +; RUN: %clang_cc1 -O2 -mllvm -pgo-cold-func-opt=optsize -mllvm -enable-pgo-force-function-attrs -fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - | FileCheck %s --check-prefix=OPTSIZE +; Check that no profile means no optsize +; RUN: %clang_cc1 -O2 -mllvm -pgo-cold-func-opt=optsize -mllvm -enable-pgo-force-function-attrs %s -emit-llvm -o - | FileCheck %s --check-prefix=NONE +; Check that no -pgo-cold-func-opt=optsize means no optsize +; RUN: %clang_cc1 -O2 -mllvm -enable-pgo-force-function-attrs 
-fprofile-sample-use=%S/Inputs/pgo-sample.prof %s -emit-llvm -o - | FileCheck %s --check-prefix=NONE + +; NONE-NOT: optsize +; OPTSIZE: optsize + +define void @f() cold { + ret void +} diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c index acff367d50eb9..c184f314f68f8 100644 --- a/clang/test/CodeGen/target-data.c +++ b/clang/test/CodeGen/target-data.c @@ -251,11 +251,11 @@ // RUN: %clang_cc1 -triple spir-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SPIR -// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +// SPIR: target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1" // RUN: %clang_cc1 -triple spir64-unknown -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=SPIR64 -// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +// SPIR64: target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-G1" // RUN: %clang_cc1 -triple bpfel -o - -emit-llvm %s | \ // RUN: FileCheck %s -check-prefix=BPFEL diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index 7953f902bf09b..d439f96982eb4 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s -int __attribute__((target_clones("ls64_v+fp16", "default"))) foo_ovl(int) { return 1; } -int __attribute__((target_clones("ls64_accdata+ls64"))) foo_ovl(void) { return 2; } +int __attribute__((target_clones("ls64+fp16", "default"))) foo_ovl(int) { return 1; } +int __attribute__((target_clones("sme+ls64"))) foo_ovl(void) { return 2; } int bar() { return foo_ovl(1) + foo_ovl(); @@ -35,9 +35,6 @@ void run_foo_tml() { Mc4.foo_tml(); } - - - //. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } // CHECK: @_Z7foo_ovli.ifunc = weak_odr alias i32 (i32), ptr @_Z7foo_ovli @@ -49,7 +46,7 @@ void run_foo_tml() { // CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver // CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver //. 
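Every resolver in the checks above and below follows the same three-instruction pattern: load the __aarch64_cpu_features bitmask once, AND it against the mask of bits a clone requires, and select that clone only if every required bit is set, otherwise fall through to the next candidate or the default. The large integer constants this patch rewrites are exactly those masks, renumbered after the feature-bit reshuffle. A minimal C++ sketch of the selection logic, with placeholder bit positions and function names (the real bit assignments live in compiler-rt and are precisely what changed here):

    #include <cstdint>

    // Placeholder for the bitmask that the runtime populates at startup;
    // defined here only so the sketch is self-contained.
    uint64_t cpu_features_bits = 0;

    int foo_clone() { return 1; }    // stands in for e.g. @foo._Mfp16Mls64
    int foo_default() { return 0; }  // stands in for @foo.default

    using Fn = int (*)();

    // Mirrors the generated resolver_entry block: and + icmp eq + branch.
    Fn foo_resolver() {
      const uint64_t need = (1ull << 5) | (1ull << 38); // illustrative bits only
      return ((cpu_features_bits & need) == need) ? foo_clone : foo_default;
    }

The equality compare against the full mask, rather than a simple non-zero test, is what makes multi-feature versions such as _Mfp16Mls64 require all of their features at once.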
-// CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64_v( +// CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64( // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 @@ -60,17 +57,17 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4503599627436032 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4503599627436032 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 274877923328 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 274877923328 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64_v +// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64 // CHECK: resolver_else: // CHECK-NEXT: ret ptr @_Z7foo_ovli.default // // -// CHECK-LABEL: @_Z7foo_ovlv._Mls64Mls64_accdata( +// CHECK-LABEL: @_Z7foo_ovlv._Mls64Msme( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // @@ -79,12 +76,12 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 11258999068426240 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 11258999068426240 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 283467841536 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 283467841536 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mls64Mls64_accdata +// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mls64Msme // CHECK: resolver_else: // CHECK-NEXT: ret ptr @_Z7foo_ovlv.default // @@ -114,16 +111,16 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36310271995674624 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1236950581248 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1236950581248 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @_ZN7MyClassIssE7foo_tmlEv._Msme-f64f64Mssbs // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2097152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2097152 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -136,16 +133,16 @@ void run_foo_tml() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36310271995674624 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 
36310271995674624 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1236950581248 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1236950581248 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: // CHECK-NEXT: ret ptr @_ZN7MyClassIisE7foo_tmlEv._Msme-f64f64Mssbs // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16777216 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16777216 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 2097152 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 2097152 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: @@ -231,7 +228,7 @@ void run_foo_tml() { // //. // CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ls64" } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme" } // CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint" } // CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index 8b7273fe3bb51..24433b8f20b7d 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -3,7 +3,7 @@ int __attribute__((target_version("sme-f64f64+bf16"))) foo(int) { return 1; } int __attribute__((target_version("default"))) foo(int) { return 2; } -int __attribute__((target_version("sm4+ebf16"))) foo(void) { return 3; } +int __attribute__((target_version("sm4+bf16"))) foo(void) { return 3; } int __attribute__((target_version("default"))) foo(void) { return 4; } struct MyClass { @@ -89,7 +89,7 @@ int bar() { // CHECK-NEXT: ret i32 2 // // -// CHECK-LABEL: @_Z3foov._Mebf16Msm4( +// CHECK-LABEL: @_Z3foov._Mbf16Msm4( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // @@ -246,8 +246,8 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1099520016384 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1099520016384 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -260,12 +260,12 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void 
@__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 8388640 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 8388640 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3foov._Mebf16Msm4 +// CHECK-NEXT: ret ptr @_Z3foov._Mbf16Msm4 // CHECK: resolver_else: // CHECK-NEXT: ret ptr @_Z3foov.default // @@ -274,8 +274,8 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1073741824 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1073741824 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 33554432 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 33554432 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: @@ -288,8 +288,8 @@ int bar() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65536 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65536 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 16384 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 16384 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: diff --git a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp b/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp deleted file mode 100644 index 95deee8bb1f1f..0000000000000 --- a/clang/test/CodeGenCXX/control-flow-in-stmt-expr.cpp +++ /dev/null @@ -1,393 +0,0 @@ -// RUN: %clang_cc1 --std=c++20 -fexceptions -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=EH %s -// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck -check-prefixes=NOEH,CHECK %s - -struct Printy { - Printy(const char *name) : name(name) {} - ~Printy() {} - const char *name; -}; - -int foo() { return 2; } - -struct Printies { - Printy a; - Printy b; - Printy c; -}; - -void ParenInit() { - // CHECK-LABEL: define dso_local void @_Z9ParenInitv() - // CHECK: [[CLEANUP_DEST:%.+]] = alloca i32, align 4 - Printies ps(Printy("a"), - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - if (foo()) return; - // CHECK: if.then: - // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4 - // CHECK-NEXT: br label %cleanup - Printy("b"); - // CHECK: if.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - ({ - if (foo()) return; - // CHECK: if.then{{.*}}: - // CHECK-NEXT: store i32 1, ptr [[CLEANUP_DEST]], align 4 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %cleanup - Printy("c"); - // CHECK: if.end{{.*}}: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %return - })); - // CHECK: cleanup: - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %return -} - -void break_in_stmt_expr() { - // Verify that the "break" in "if.then".calls dtor 
before jumping to "for.end". - - // CHECK-LABEL: define dso_local void @_Z18break_in_stmt_exprv() - Printies p{Printy("a"), - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - for (;;) { - Printies ps{ - Printy("b"), - // CHECK: for.cond: - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - if (foo()) { - break; - // CHECK: if.then: - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %for.end - } - Printy("c"); - // CHECK: if.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("d")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %for.cond - } - Printy("e"); - // CHECK: for.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("f")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev -} - -void goto_in_stmt_expr() { - // Verify that: - // - correct branch fixups for deactivated normal cleanups are generated correctly. - - // CHECK-LABEL: define dso_local void @_Z17goto_in_stmt_exprv() - // CHECK: [[CLEANUP_DEST_SLOT:%cleanup.dest.slot.*]] = alloca i32, align 4 - { - Printies p1{Printy("a"), // CHECK: call void @_ZN6PrintyC1EPKc - ({ - { - Printies p2{Printy("b"), - // CHECK: call void @_ZN6PrintyC1EPKc - ({ - if (foo() == 1) { - goto in; - // CHECK: if.then: - // CHECK-NEXT: store i32 2, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: br label %[[CLEANUP1:.+]] - } - if (foo() == 2) { - goto out; - // CHECK: if.then{{.*}}: - // CHECK-NEXT: store i32 3, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: br label %[[CLEANUP1]] - } - Printy("c"); - // CHECK: if.end{{.*}}: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("d")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %in - - } - in: - Printy("e"); - // CHECK: in: ; preds = %if.end{{.*}}, %[[CLEANUP1]] - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - }), - Printy("f")}; - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZN8PrintiesD1Ev - // CHECK-NEXT: br label %out - } -out: - return; - // CHECK: out: - // CHECK-NEXT: ret void - - // CHECK: [[CLEANUP1]]: ; preds = %if.then{{.*}}, %if.then - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: %cleanup.dest = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: switch i32 %cleanup.dest, label %[[CLEANUP2:.+]] [ - // CHECK-NEXT: i32 2, label %in - // CHECK-NEXT: ] - - // CHECK: [[CLEANUP2]]: ; preds = %[[CLEANUP1]] - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: %cleanup.dest{{.*}} = load i32, ptr [[CLEANUP_DEST_SLOT]], align 4 - // CHECK-NEXT: switch i32 %cleanup.dest{{.*}}, label %unreachable [ - // CHECK-NEXT: i32 3, label %out - // CHECK-NEXT: ] -} - -void ArrayInit() { - // Printy arr[4] = {ctorA, ctorB, stmt-exprC, stmt-exprD}; - // Verify that: - // - We do the necessary stores for array cleanups (endOfInit and last constructed element). - // - We update the array init element correctly for ctorA, ctorB and stmt-exprC. - // - stmt-exprC and stmt-exprD share the array body dtor code (see %cleanup). 
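The endOfInit slot named in the (now deleted) checks above is the heart of the scheme: the compiler keeps a pointer one past the last fully constructed element, so an early exit out of a statement expression destroys exactly the constructed prefix, in reverse order. A hand-written C++ equivalent of what the emitted arraydestroy loop accomplishes, illustrative only:

    struct Elem {
      ~Elem() {}
    };

    // Walk backwards from one past the last constructed element, exactly
    // like the arraydestroy.body loop in the removed checks.
    void destroyConstructedPrefix(Elem *begin, Elem *endOfInit) {
      while (endOfInit != begin)
        (--endOfInit)->~Elem();
    }

Nothing runs when endOfInit still equals begin, which is the arraydestroy.isempty test in the checks.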
- - // CHECK-LABEL: define dso_local void @_Z9ArrayInitv() - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - // CHECK: %cleanup.dest.slot = alloca i32, align 4 - // CHECK: %arrayinit.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i64 0, i64 0 - // CHECK: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit, align 8 - Printy arr[4] = { - Printy("a"), - // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str) - // CHECK: [[ARRAYINIT_ELEMENT1:%.+]] = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1 - // CHECK: store ptr [[ARRAYINIT_ELEMENT1]], ptr %arrayinit.endOfInit, align 8 - Printy("b"), - // CHECK: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) [[ARRAYINIT_ELEMENT1]], ptr noundef @.str.1) - // CHECK: [[ARRAYINIT_ELEMENT2:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAYINIT_ELEMENT1]], i64 1 - // CHECK: store ptr [[ARRAYINIT_ELEMENT2]], ptr %arrayinit.endOfInit, align 8 - ({ - // CHECK: br i1 {{.*}}, label %if.then, label %if.end - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4 - // CHECK-NEXT: br label %cleanup - } - // CHECK: if.end: - Printy("c"); - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: %arrayinit.element2 = getelementptr inbounds %struct.Printy, ptr %arrayinit.element1, i64 1 - // CHECK-NEXT: store ptr %arrayinit.element2, ptr %arrayinit.endOfInit, align 8 - }), - ({ - // CHECK: br i1 {{%.+}} label %[[IF_THEN2:.+]], label %[[IF_END2:.+]] - if (foo()) { - return; - // CHECK: [[IF_THEN2]]: - // CHECK-NEXT: store i32 1, ptr %cleanup.dest.slot, align 4 - // CHECK-NEXT: br label %cleanup - } - // CHECK: [[IF_END2]]: - Printy("d"); - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: %array.begin = getelementptr inbounds [4 x %struct.Printy], ptr %arr, i32 0, i32 0 - // CHECK-NEXT: %0 = getelementptr inbounds %struct.Printy, ptr %array.begin, i64 4 - // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY1:.+]] - }), - }; - - // CHECK: [[ARRAY_DESTROY_BODY1]]: - // CHECK-NEXT: %arraydestroy.elementPast{{.*}} = phi ptr [ %0, %[[IF_END2]] ], [ %arraydestroy.element{{.*}}, %[[ARRAY_DESTROY_BODY1]] ] - // CHECK-NEXT: %arraydestroy.element{{.*}} = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast{{.*}}, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: %arraydestroy.done{{.*}} = icmp eq ptr %arraydestroy.element{{.*}}, %array.begin - // CHECK-NEXT: br i1 %arraydestroy.done{{.*}}, label %[[ARRAY_DESTROY_DONE1:.+]], label %[[ARRAY_DESTROY_BODY1]] - - // CHECK: [[ARRAY_DESTROY_DONE1]]: - // CHECK-NEXT: ret void - - // CHECK: cleanup: - // CHECK-NEXT: %1 = load ptr, ptr %arrayinit.endOfInit, align 8 - // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin, %1 - // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2:.+]] - - // CHECK: [[ARRAY_DESTROY_BODY2]]: - // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %1, %cleanup ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY2]] ] - // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element) - // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %arrayinit.begin - // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE2]], label 
%[[ARRAY_DESTROY_BODY2]] - - // CHECK: [[ARRAY_DESTROY_DONE2]]: - // CHECK-NEXT: br label %[[ARRAY_DESTROY_DONE1]] -} - -void ArraySubobjects() { - struct S { - Printy arr1[2]; - Printy arr2[2]; - Printy p; - }; - // CHECK-LABEL: define dso_local void @_Z15ArraySubobjectsv() - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - S s{{Printy("a"), Printy("b")}, - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: call void @_ZN6PrintyC1EPKc - {Printy("a"), - // CHECK: [[ARRAYINIT_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy] - // CHECK: store ptr [[ARRAYINIT_BEGIN]], ptr %arrayinit.endOfInit, align 8 - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: [[ARRAYINIT_ELEMENT:%.+]] = getelementptr inbounds %struct.Printy - // CHECK: store ptr [[ARRAYINIT_ELEMENT]], ptr %arrayinit.endOfInit, align 8 - ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: [[V0:%.+]] = load ptr, ptr %arrayinit.endOfInit, align 8 - // CHECK-NEXT: %arraydestroy.isempty = icmp eq ptr [[ARRAYINIT_BEGIN]], [[V0]] - // CHECK-NEXT: br i1 %arraydestroy.isempty, label %[[ARRAY_DESTROY_DONE:.+]], label %[[ARRAY_DESTROY_BODY:.+]] - } - Printy("b"); - }) - }, - Printy("c") - // CHECK: if.end: - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK-NEXT: call void @_ZZ15ArraySubobjectsvEN1SD1Ev - // CHECK-NEXT: br label %return - }; - // CHECK: return: - // CHECK-NEXT: ret void - - // CHECK: [[ARRAY_DESTROY_BODY]]: - // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %0, %if.then ], [ %arraydestroy.element, %[[ARRAY_DESTROY_BODY]] ] - // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element) - // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, [[ARRAYINIT_BEGIN]] - // CHECK-NEXT: br i1 %arraydestroy.done, label %[[ARRAY_DESTROY_DONE]], label %[[ARRAY_DESTROY_BODY]] - - // CHECK: [[ARRAY_DESTROY_DONE]] - // CHECK-NEXT: [[ARRAY_BEGIN:%.+]] = getelementptr inbounds [2 x %struct.Printy], ptr %arr1, i32 0, i32 0 - // CHECK-NEXT: [[V1:%.+]] = getelementptr inbounds %struct.Printy, ptr [[ARRAY_BEGIN]], i64 2 - // CHECK-NEXT: br label %[[ARRAY_DESTROY_BODY2:.+]] - - // CHECK: [[ARRAY_DESTROY_BODY2]]: - // CHECK-NEXT: %arraydestroy.elementPast5 = phi ptr [ %1, %[[ARRAY_DESTROY_DONE]] ], [ %arraydestroy.element6, %[[ARRAY_DESTROY_BODY2]] ] - // CHECK-NEXT: %arraydestroy.element6 = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast5, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element6) - // CHECK-NEXT: %arraydestroy.done7 = icmp eq ptr %arraydestroy.element6, [[ARRAY_BEGIN]] - // CHECK-NEXT: br i1 %arraydestroy.done7, label %[[ARRAY_DESTROY_DONE2:.+]], label %[[ARRAY_DESTROY_BODY2]] - - - // CHECK: [[ARRAY_DESTROY_DONE2]]: - // CHECK-NEXT: br label %return -} - -void LambdaInit() { - // CHECK-LABEL: define dso_local void @_Z10LambdaInitv() - auto S = [a = Printy("a"), b = ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: call void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %return - } - Printy("b"); - })]() { return a; }; -} - -void LifetimeExtended() { - // CHECK-LABEL: define dso_local void @_Z16LifetimeExtendedv - struct PrintyRefBind { - const Printy &a; - const Printy &b; - }; - PrintyRefBind ps = {Printy("a"), ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK-NEXT: call 
void @_ZN6PrintyD1Ev - // CHECK-NEXT: br label %return - } - Printy("b"); - })}; -} - -void NewArrayInit() { - // CHECK-LABEL: define dso_local void @_Z12NewArrayInitv() - // CHECK: %array.init.end = alloca ptr, align 8 - // CHECK: store ptr %0, ptr %array.init.end, align 8 - Printy *array = new Printy[3]{ - "a", - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: store ptr %array.exp.next, ptr %array.init.end, align 8 - "b", - // CHECK: call void @_ZN6PrintyC1EPKc - // CHECK: store ptr %array.exp.next1, ptr %array.init.end, align 8 - ({ - if (foo()) { - return; - // CHECK: if.then: - // CHECK: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body - } - "b"; - // CHECK: if.end: - // CHECK: call void @_ZN6PrintyC1EPKc - })}; - // CHECK: arraydestroy.body: - // CHECK-NEXT: %arraydestroy.elementPast = phi ptr [ %{{.*}}, %if.then ], [ %arraydestroy.element, %arraydestroy.body ] - // CHECK-NEXT: %arraydestroy.element = getelementptr inbounds %struct.Printy, ptr %arraydestroy.elementPast, i64 -1 - // CHECK-NEXT: call void @_ZN6PrintyD1Ev(ptr noundef nonnull align 8 dereferenceable(8) %arraydestroy.element) - // CHECK-NEXT: %arraydestroy.done = icmp eq ptr %arraydestroy.element, %0 - // CHECK-NEXT: br i1 %arraydestroy.done, label %arraydestroy.done{{.*}}, label %arraydestroy.body - - // CHECK: arraydestroy.done{{.*}}: ; preds = %arraydestroy.body, %if.then - // CHECK-NEXT: br label %return -} - -void DestroyInConditionalCleanup() { - // EH-LABEL: DestroyInConditionalCleanupv() - // NOEH-LABEL: DestroyInConditionalCleanupv() - struct A { - A() {} - ~A() {} - }; - - struct Value { - Value(A) {} - ~Value() {} - }; - - struct V2 { - Value K; - Value V; - }; - // Verify we use conditional cleanups. - (void)(foo() ? V2{A(), A()} : V2{A(), A()}); - // NOEH: cond.true: - // NOEH: call void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev - // NOEH: store ptr %{{.*}}, ptr %cond-cleanup.save - - // EH: cond.true: - // EH: invoke void @_ZZ27DestroyInConditionalCleanupvEN1AC1Ev - // EH: store ptr %{{.*}}, ptr %cond-cleanup.save -} - -void ArrayInitWithContinue() { - // CHECK-LABEL: @_Z21ArrayInitWithContinuev - // Verify that we start to emit the array destructor. - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - for (int i = 0; i < 1; ++i) { - Printy arr[2] = {"a", ({ - if (foo()) { - continue; - } - "b"; - })}; - } -} diff --git a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm index 33cdf437110a9..a2a9122fc3913 100644 --- a/clang/test/CodeGenCXX/module-funcs-from-imports.cppm +++ b/clang/test/CodeGenCXX/module-funcs-from-imports.cppm @@ -23,6 +23,21 @@ int func_in_gmf_not_called() { return 44; } +template <class T> +class A { +public: + __attribute__((always_inline)) + inline constexpr int getValue() { + return 43; + } + + inline constexpr int getValue2() { + return 43; + } +}; + +extern template class A<char>; + //--- M.cppm module; #include "foo.h" @@ -47,17 +62,21 @@ int always_inline_func() { return 45; } +export using ::A; + //--- Use.cpp import M; int use() { - return exported_func() + always_inline_func(); + A<char> a; + return exported_func() + always_inline_func() + + a.getValue() + a.getValue2(); } -// Checks that none of the function (except the always_inline_func) in the importees -// are generated in the importer's code.
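Read together, the updated checks below encode the importer-side policy this hunk adopts: function bodies owned by an imported module unit are no longer emitted available_externally (they become plain declarations, even for always_inline_func), while the always_inline member of the extern-template A<char> still gets a body so the inliner has something to work with. A rough C++ model of that decision, an assumption distilled only from the checks rather than from Clang's actual CodeGen logic:

    // Hypothetical simplification; the real decision lives in Clang CodeGen.
    struct FuncInfo {
      bool ownedByImportedModuleUnit;        // e.g. always_inline_func in M
      bool alwaysInlineExternTemplateMember; // e.g. A<char>::getValue
    };

    enum class Emission { Define, Declare };

    Emission importerEmission(const FuncInfo &f) {
      if (f.alwaysInlineExternTemplateMember)
        return Emission::Define;   // CHECK-O0: define ... @_ZN1AIcE8getValueEv(
      if (f.ownedByImportedModuleUnit)
        return Emission::Declare;  // CHECK-O0: declare ... always_inline_funcv(
      return Emission::Define;     // ordinary in-TU functions are unaffected
    }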
// CHECK-O0: define{{.*}}_Z3usev( // CHECK-O0: declare{{.*}}_ZW1M13exported_funcv( -// CHECK-O0: define{{.*}}available_externally{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O0: declare{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O0: define{{.*}}@_ZN1AIcE8getValueEv( +// CHECK-O0: declare{{.*}}@_ZN1AIcE9getValue2Ev( // CHECK-O0-NOT: func_in_gmf // CHECK-O0-NOT: func_in_gmf_not_called // CHECK-O0-NOT: non_exported_func @@ -68,7 +87,9 @@ int use() { // O0 to keep consistent ABI. // CHECK-O1: define{{.*}}_Z3usev( // CHECK-O1: declare{{.*}}_ZW1M13exported_funcv( -// CHECK-O1: define{{.*}}available_externally{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O1: declare{{.*}}_ZW1M18always_inline_funcv( +// CHECK-O1: define{{.*}}@_ZN1AIcE8getValueEv( +// CHECK-O1: declare{{.*}}@_ZN1AIcE9getValue2Ev( // CHECK-O1-NOT: func_in_gmf // CHECK-O1-NOT: func_in_gmf_not_called // CHECK-O1-NOT: non_exported_func diff --git a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp b/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp deleted file mode 100644 index 06cc2069dbe9a..0000000000000 --- a/clang/test/CodeGenCoroutines/coro-suspend-cleanups.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// RUN: %clang_cc1 --std=c++20 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s - -#include "Inputs/coroutine.h" - -struct Printy { - Printy(const char *name) : name(name) {} - ~Printy() {} - const char *name; -}; - -struct coroutine { - struct promise_type; - std::coroutine_handle<promise_type> handle; - ~coroutine() { - if (handle) handle.destroy(); - } -}; - -struct coroutine::promise_type { - coroutine get_return_object() { - return {std::coroutine_handle<promise_type>::from_promise(*this)}; - } - std::suspend_never initial_suspend() noexcept { return {}; } - std::suspend_always final_suspend() noexcept { return {}; } - void return_void() {} - void unhandled_exception() {} -}; - -struct Awaiter : std::suspend_always { - Printy await_resume() { return {"awaited"}; } -}; - -int foo() { return 2; } - -coroutine ArrayInitCoro() { - // Verify that: - // - We do the necessary stores for array cleanups. - // - Array cleanups are called by await.cleanup. - // - We activate the cleanup after the first element and deactivate it in await.ready (see cleanup.isactive).
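The activate/deactivate dance in the list above is easiest to see as a flag-guarded destructor: the flag is stored true once the first array element exists, survives the suspend point in the coroutine frame, and is stored false after await_resume hands back the last element. A standalone C++ illustration of the same shape (not the generated code, just its effect):

    #include <cstdio>

    struct ArrayCleanup {
      bool active = false; // models the cleanup.isactive i1 slot
      ~ArrayCleanup() {
        if (active)
          std::puts("destroy constructed elements"); // stands in for arraydestroy.body
      }
    };

    void model() {
      ArrayCleanup c;
      c.active = true;  // store i1 true once the first element is constructed
      // ... suspend point: destroying the coroutine frame here runs the cleanup ...
      c.active = false; // store i1 false in await.ready, after await_resume
    }

Only if the frame is destroyed while suspended does the guarded cleanup fire, which is exactly the path await.cleanup branches into.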
- - // CHECK-LABEL: define dso_local void @_Z13ArrayInitCorov - // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - // CHECK: %cleanup.isactive = alloca i1, align 1 - Printy arr[2] = { - Printy("a"), - // CHECK: %arrayinit.begin = getelementptr inbounds [2 x %struct.Printy], ptr %arr.reload.addr, i64 0, i64 0 - // CHECK-NEXT: %arrayinit.begin.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 10 - // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.begin.spill.addr, align 8 - // CHECK-NEXT: store i1 true, ptr %cleanup.isactive.reload.addr, align 1 - // CHECK-NEXT: store ptr %arrayinit.begin, ptr %arrayinit.endOfInit.reload.addr, align 8 - // CHECK-NEXT: call void @_ZN6PrintyC1EPKc(ptr noundef nonnull align 8 dereferenceable(8) %arrayinit.begin, ptr noundef @.str) - // CHECK-NEXT: %arrayinit.element = getelementptr inbounds %struct.Printy, ptr %arrayinit.begin, i64 1 - // CHECK-NEXT: %arrayinit.element.spill.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11 - // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.element.spill.addr, align 8 - // CHECK-NEXT: store ptr %arrayinit.element, ptr %arrayinit.endOfInit.reload.addr, align 8 - co_await Awaiter{} - // CHECK-NEXT: @_ZNSt14suspend_always11await_readyEv - // CHECK-NEXT: br i1 %{{.+}}, label %await.ready, label %CoroSave30 - }; - // CHECK: await.cleanup: ; preds = %AfterCoroSuspend{{.*}} - // CHECK-NEXT: br label %cleanup{{.*}}.from.await.cleanup - - // CHECK: cleanup{{.*}}.from.await.cleanup: ; preds = %await.cleanup - // CHECK: br label %cleanup{{.*}} - - // CHECK: await.ready: - // CHECK-NEXT: %arrayinit.element.reload.addr = getelementptr inbounds %_Z13ArrayInitCorov.Frame, ptr %0, i32 0, i32 11 - // CHECK-NEXT: %arrayinit.element.reload = load ptr, ptr %arrayinit.element.reload.addr, align 8 - // CHECK-NEXT: call void @_ZN7Awaiter12await_resumeEv - // CHECK-NEXT: store i1 false, ptr %cleanup.isactive.reload.addr, align 1 - // CHECK-NEXT: br label %cleanup{{.*}}.from.await.ready - - // CHECK: cleanup{{.*}}: ; preds = %cleanup{{.*}}.from.await.ready, %cleanup{{.*}}.from.await.cleanup - // CHECK: %cleanup.is_active = load i1, ptr %cleanup.isactive.reload.addr, align 1 - // CHECK-NEXT: br i1 %cleanup.is_active, label %cleanup.action, label %cleanup.done - - // CHECK: cleanup.action: - // CHECK: %arraydestroy.isempty = icmp eq ptr %arrayinit.begin.reload{{.*}}, %{{.*}} - // CHECK-NEXT: br i1 %arraydestroy.isempty, label %arraydestroy.done{{.*}}, label %arraydestroy.body.from.cleanup.action - // Ignore rest of the array cleanup. -} - -coroutine ArrayInitWithCoReturn() { - // CHECK-LABEL: define dso_local void @_Z21ArrayInitWithCoReturnv - // Verify that we start to emit the array destructor. 
- // CHECK: %arrayinit.endOfInit = alloca ptr, align 8 - Printy arr[2] = {"a", ({ - if (foo()) { - co_return; - } - "b"; - })}; -} diff --git a/clang/test/Driver/riscv-features.c b/clang/test/Driver/riscv-features.c index ce4947d2bc47b..5e1db5ba1ed3e 100644 --- a/clang/test/Driver/riscv-features.c +++ b/clang/test/Driver/riscv-features.c @@ -38,8 +38,8 @@ // RUN: %clang --target=riscv32-unknown-elf -### %s -mno-strict-align 2>&1 | FileCheck %s -check-prefix=FAST-UNALIGNED-ACCESS // RUN: %clang --target=riscv32-unknown-elf -### %s -mstrict-align 2>&1 | FileCheck %s -check-prefix=NO-FAST-UNALIGNED-ACCESS -// FAST-UNALIGNED-ACCESS: "-target-feature" "+fast-unaligned-access" -// NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-fast-unaligned-access" +// FAST-UNALIGNED-ACCESS: "-target-feature" "+unaligned-scalar-mem" "-target-feature" "+unaligned-vector-mem" +// NO-FAST-UNALIGNED-ACCESS: "-target-feature" "-unaligned-scalar-mem" "-target-feature" "-unaligned-vector-mem" // RUN: %clang --target=riscv32-unknown-elf -### %s 2>&1 | FileCheck %s -check-prefix=NOUWTABLE // RUN: %clang --target=riscv32-unknown-elf -fasynchronous-unwind-tables -### %s 2>&1 | FileCheck %s -check-prefix=UWTABLE diff --git a/clang/test/Driver/windows-seh-async-verify.cpp b/clang/test/Driver/windows-seh-async-verify.cpp new file mode 100644 index 0000000000000..ace93cf44a31d --- /dev/null +++ b/clang/test/Driver/windows-seh-async-verify.cpp @@ -0,0 +1,24 @@ +// RUN: %clang --target=x86_64-pc-windows -fasync-exceptions -fsyntax-only -### %s 2>&1 | FileCheck %s +// RUN: %clang_cl --target=x86_64-pc-windows /EHa -fsyntax-only -### -- %s 2>&1 | FileCheck %s +// RUN: %clang --target=x86_64-pc-windows-gnu -fasync-exceptions -fsyntax-only -### %s 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,GNU +// RUN: %clang_cl --target=x86_64-pc-windows-gnu /EHa -fsyntax-only -### -- %s 2>&1 | FileCheck %s --check-prefixes=GNU-ALL,CL-GNU + +// CHECK-NOT: warning +// GNU: warning: argument unused during compilation: '-fasync-exceptions' [-Wunused-command-line-argument] +// CL-GNU: warning: argument unused during compilation: '/EHa' [-Wunused-command-line-argument] + +// CHECK: -fasync-exceptions +// GNU-ALL-NOT: -fasync-exceptions +struct S { + union _Un { + ~_Un() {} + char _Buf[12]; + }; + _Un _un; +}; + +struct Embed { + S v2; +}; + +void PR62449() { Embed v{}; } diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index 2e5f521a5feae..1271868a53b86 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -1685,7 +1685,7 @@ extern "C" __device__ double test_j1(double x) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -1718,7 +1718,7 @@ extern "C" __device__ double test_j1(double x) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ 
[[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract float [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]] @@ -1751,7 +1751,7 @@ extern "C" __device__ double test_j1(double x) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -1788,7 +1788,7 @@ extern "C" __device__ float test_jnf(int x, float y) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] @@ -1821,7 +1821,7 @@ extern "C" __device__ float test_jnf(int x, float y) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract double [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]] @@ -1854,7 +1854,7 @@ extern "C" __device__ float test_jnf(int x, float y) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] @@ -4222,7 +4222,7 @@ extern "C" 
__device__ double test_y1(double x) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -4255,7 +4255,7 @@ extern "C" __device__ double test_y1(double x) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract float [[CONV_I]], [[Y]] // FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract float [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract float [[MUL8_I]], [[__X0_0_I2]] @@ -4288,7 +4288,7 @@ extern "C" __device__ double test_y1(double x) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi float [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi float [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to float +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to float // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract float [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract float [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract float [[MUL8_I]], [[__X0_0_I2]] @@ -4325,7 +4325,7 @@ extern "C" __device__ float test_ynf(int x, float y) { // DEFAULT-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // DEFAULT-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// DEFAULT-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// DEFAULT-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // DEFAULT-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // DEFAULT-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // DEFAULT-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] @@ -4358,7 +4358,7 @@ extern "C" __device__ float test_ynf(int x, float y) { // FINITEONLY-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // FINITEONLY-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// FINITEONLY-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// FINITEONLY-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // FINITEONLY-NEXT: [[DIV_I:%.*]] = fdiv nnan ninf contract double [[CONV_I]], [[Y]] // 
FINITEONLY-NEXT: [[MUL8_I:%.*]] = fmul nnan ninf contract double [[__X1_0_I3]], [[DIV_I]] // FINITEONLY-NEXT: [[SUB_I]] = fsub nnan ninf contract double [[MUL8_I]], [[__X0_0_I2]] @@ -4391,7 +4391,7 @@ extern "C" __device__ float test_ynf(int x, float y) { // APPROX-NEXT: [[__X1_0_I3:%.*]] = phi double [ [[SUB_I:%.*]], [[FOR_BODY_I]] ], [ [[CALL_I21_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[__X0_0_I2:%.*]] = phi double [ [[__X1_0_I3]], [[FOR_BODY_I]] ], [ [[CALL_I_I]], [[IF_END4_I]] ] // APPROX-NEXT: [[MUL_I:%.*]] = shl nuw nsw i32 [[__I_0_I4]], 1 -// APPROX-NEXT: [[CONV_I:%.*]] = sitofp i32 [[MUL_I]] to double +// APPROX-NEXT: [[CONV_I:%.*]] = uitofp nneg i32 [[MUL_I]] to double // APPROX-NEXT: [[DIV_I:%.*]] = fdiv contract double [[CONV_I]], [[Y]] // APPROX-NEXT: [[MUL8_I:%.*]] = fmul contract double [[__X1_0_I3]], [[DIV_I]] // APPROX-NEXT: [[SUB_I]] = fsub contract double [[MUL8_I]], [[__X0_0_I2]] diff --git a/clang/test/Index/USR/func-type.cpp b/clang/test/Index/USR/func-type.cpp index ff1cd37a7fc42..459a8cd6da558 100644 --- a/clang/test/Index/USR/func-type.cpp +++ b/clang/test/Index/USR/func-type.cpp @@ -16,3 +16,15 @@ void Func( void (* (*)(int, int))(int, int) ); // CHECK: {{[0-9]+}}:6 | function/C | Func | c:@F@Func#*F*Fv(#I#I)(#I#I)# | void Func( void (* (*)(int, int, int))(int) ); // CHECK: {{[0-9]+}}:6 | function/C | Func | c:@F@Func#*F*Fv(#I)(#I#I#I)# | + +// Functions with parameter types that only differ in top-level cv-qualification should generate the same USR. + +void f( const int ); +// CHECK: {{[0-9]+}}:6 | function/C | f | c:@F@f#I# | +void f( int ); +// CHECK: {{[0-9]+}}:6 | function/C | f | c:@F@f#I# | + +void g( int ); +// CHECK: {{[0-9]+}}:6 | function/C | g | c:@F@g#I# | +void g( const int ); +// CHECK: {{[0-9]+}}:6 | function/C | g | c:@F@g#I# | diff --git a/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm b/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm new file mode 100644 index 0000000000000..66143102cb9e4 --- /dev/null +++ b/clang/test/Modules/hashing-decls-in-exprs-from-gmf-2.cppm @@ -0,0 +1,44 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 -fskip-odr-check-in-gmf %t/test.cpp -fprebuilt-module-path=%t -fsyntax-only -verify + +//--- header.h +#pragma once +template <class _Tp> +class Optional {}; + +template <class _Tp> +concept C = requires(const _Tp& __t) { + []<class _Up>(const Optional<_Up>&) {}(__t); +}; + +//--- func.h +#include "header.h" +template <C T> +void func() {} + +//--- test_func.h +#include "func.h" + +inline void test_func() { + func<Optional<int>>(); +} + +//--- A.cppm +module; +#include "header.h" +#include "test_func.h" +export module A; +export using ::test_func; + +//--- test.cpp +// expected-no-diagnostics +import A; +#include "test_func.h" + +void test() { + test_func(); +} diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index 2369df58308a7..4462f0df540f2 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -376,16 +376,13 @@ void SelfClause() { #pragma acc serial self(i > j, seq for(;;){} - // expected-warning@+2{{left operand of comma operator has no effect}} - // expected-warning@+1{{OpenACC clause 'self' not yet implemented, clause ignored}} + // expected-warning@+1{{left operand of comma operator has no effect}} #pragma acc serial self(i, j) for(;;){} - // expected-warning@+1{{OpenACC clause 'self' not yet 
implemented, clause ignored}} #pragma acc serial self(i > j) for(;;){} - // expected-warning@+2{{OpenACC clause 'self' not yet implemented, clause ignored}} // expected-warning@+1{{OpenACC clause 'seq' not yet implemented, clause ignored}} #pragma acc serial self(1+5>3), seq for(;;){} diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index ec7764bb53818..646043681fe33 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -79,6 +79,7 @@ // CHECK-NOT: __riscv_za128rs {{.*$}} // CHECK-NOT: __riscv_za64rs {{.*$}} // CHECK-NOT: __riscv_zacas {{.*$}} +// CHECK-NOT: __riscv_zama16b {{.*$}} // CHECK-NOT: __riscv_zawrs {{.*$}} // CHECK-NOT: __riscv_zba {{.*$}} // CHECK-NOT: __riscv_zbb {{.*$}} @@ -704,6 +705,12 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s // CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}} +// RUN: %clang --target=riscv32 -march=rv32izama16b -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZAMA16B-EXT %s +// RUN: %clang --target=riscv64 -march=rv64izama16b -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZAMA16B-EXT %s +// CHECK-ZAMA16B-EXT: __riscv_zama16b 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izawrs -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZAWRS-EXT %s diff --git a/clang/test/Sema/aarch64-cpu-supports.c b/clang/test/Sema/aarch64-cpu-supports.c index ddeed7c5bc9e9..cf8e043dfb1e3 100644 --- a/clang/test/Sema/aarch64-cpu-supports.c +++ b/clang/test/Sema/aarch64-cpu-supports.c @@ -20,7 +20,7 @@ int test_aarch64_features(void) { // expected-warning@+1 {{invalid cpu feature string}} if (__builtin_cpu_supports("default")) return 6; - if (__builtin_cpu_supports(" ssbs + bti ")) + if (__builtin_cpu_supports(" ssbs + crc ")) return 7; return 0; } diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 6a1feeb9bf539..55c97c73e8b69 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ -// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -Waarch64-sme-attributes -fsyntax-only -verify %s +// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target @@ -33,7 +33,6 @@ svuint32_t incompat_sve_sm(svbool_t pg, svuint32_t a, int16_t b) __arm_streaming return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svuint32_t incompat_sve_ls(svbool_t pg, svuint32_t a, int64_t b) { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} return __builtin_sve_svld1_gather_u32base_index_u32(pg, a, b); @@ -49,7 +48,6 @@ svuint32_t incompat_sve2_sm(svbool_t pg, svuint32_t a, int64_t b) __arm_streamin return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. 
The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svuint32_t incompat_sve2_ls(svbool_t pg, svuint32_t a, int64_t b) { // expected-warning@+1 {{builtin call has undefined behaviour when called from a streaming function}} return __builtin_sve_svldnt1_gather_u32base_index_u32(pg, a, b); @@ -70,7 +68,6 @@ svfloat64_t streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) __arm_ return svadd_n_f64_m(pg, a, b); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svfloat64_t locally_streaming_caller_sve(svbool_t pg, svfloat64_t a, float64_t b) { // expected-no-warning return svadd_n_f64_m(pg, a, b); @@ -86,7 +83,6 @@ svint16_t streaming_caller_sve2(svint16_t op1, svint16_t op2) __arm_streaming { return svmul_lane_s16(op1, op2, 0); } -// expected-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} __arm_locally_streaming svint16_t locally_streaming_caller_sve2(svint16_t op1, svint16_t op2) { // expected-no-warning return svmul_lane_s16(op1, op2, 0); diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index 12de16509ccb8..bfc8768c3f36e 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -Waarch64-sme-attributes -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -Waarch64-sme-attributes -fsyntax-only -verify=expected-cpp -x c++ %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -fsyntax-only -verify=expected-cpp -x c++ %s // Valid attributes @@ -496,135 +496,3 @@ void fmv_caller() { just_fine(); incompatible_locally_streaming(); } - -void sme_streaming_with_vl_arg(__SVInt8_t a) __arm_streaming { } - -__SVInt8_t sme_streaming_returns_vl(void) __arm_streaming { __SVInt8_t r; return r; } - -void sme_streaming_compatible_with_vl_arg(__SVInt8_t a) __arm_streaming_compatible { } - -__SVInt8_t sme_streaming_compatible_returns_vl(void) __arm_streaming_compatible { __SVInt8_t r; return r; } - -void sme_no_streaming_with_vl_arg(__SVInt8_t a) { } - -__SVInt8_t sme_no_streaming_returns_vl(void) { __SVInt8_t r; return r; } - -// expected-warning@+2 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -// expected-cpp-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -__arm_locally_streaming void sme_locally_streaming_with_vl_arg(__SVInt8_t a) { } - -// expected-warning@+2 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. The streaming and non-streaming vector lengths may be different}} -// expected-cpp-warning@+1 {{passing/returning a VL-dependent argument to/from a __arm_locally_streaming function. 
The streaming and non-streaming vector lengths may be different}} -__arm_locally_streaming __SVInt8_t sme_locally_streaming_returns_vl(void) { __SVInt8_t r; return r; } - -void sme_no_streaming_calling_streaming_with_vl_args() { - __SVInt8_t a; - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sme_streaming_with_vl_arg(a); -} - -void sme_no_streaming_calling_streaming_with_return_vl() { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_streaming_returns_vl(); -} - -void sme_streaming_calling_non_streaming_with_vl_args(void) __arm_streaming { - __SVInt8_t a; - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sme_no_streaming_with_vl_arg(a); -} - -void sme_streaming_calling_non_streaming_with_return_vl(void) __arm_streaming { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_no_streaming_returns_vl(); -} - -void sme_no_streaming_calling_streaming_with_vl_args_param(__SVInt8_t arg, void (*sc)( __SVInt8_t arg) __arm_streaming) { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sc(arg); -} - -__SVInt8_t sme_no_streaming_calling_streaming_return_vl_param(__SVInt8_t (*s)(void) __arm_streaming) { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - return s(); -} - -void sme_streaming_compatible_calling_streaming_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. 
The streaming and non-streaming vector lengths may be different}} - sme_streaming_with_vl_arg(arg); -} - -void sme_streaming_compatible_calling_sme_streaming_return_vl(void) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_streaming_returns_vl(); -} - -void sme_streaming_compatible_calling_no_streaming_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - sme_no_streaming_with_vl_arg(arg); -} - -void sme_streaming_compatible_calling_no_sme_streaming_return_vl(void) __arm_streaming_compatible { - // expected-warning@+2 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - // expected-cpp-warning@+1 {{passing a VL-dependent argument to/from a function that has a different streaming-mode. The streaming and non-streaming vector lengths may be different}} - __SVInt8_t r = sme_no_streaming_returns_vl(); -} - -void sme_streaming_calling_streaming(__SVInt8_t arg, void (*s)( __SVInt8_t arg) __arm_streaming) __arm_streaming { - s(arg); -} - -__SVInt8_t sme_streaming_calling_streaming_return_vl(__SVInt8_t (*s)(void) __arm_streaming) __arm_streaming { - return s(); -} - -void sme_streaming_calling_streaming_with_vl_args(__SVInt8_t a) __arm_streaming { - sme_streaming_with_vl_arg(a); -} - -void sme_streaming_calling_streaming_with_return_vl(void) __arm_streaming { - __SVInt8_t r = sme_streaming_returns_vl(); -} - -void sme_streaming_calling_streaming_compatible_with_vl_args(__SVInt8_t a) __arm_streaming { - sme_streaming_compatible_with_vl_arg(a); -} - -void sme_streaming_calling_streaming_compatible_with_return_vl(void) __arm_streaming { - __SVInt8_t r = sme_streaming_compatible_returns_vl(); -} - -void sme_no_streaming_calling_streaming_compatible_with_vl_args() { - __SVInt8_t a; - sme_streaming_compatible_with_vl_arg(a); -} - -void sme_no_streaming_calling_streaming_compatible_with_return_vl() { - __SVInt8_t r = sme_streaming_compatible_returns_vl(); -} - -void sme_no_streaming_calling_non_streaming_compatible_with_vl_args() { - __SVInt8_t a; - sme_no_streaming_with_vl_arg(a); -} - -void sme_no_streaming_calling_non_streaming_compatible_with_return_vl() { - __SVInt8_t r = sme_no_streaming_returns_vl(); -} - -void sme_streaming_compatible_calling_streaming_compatible_with_vl_args(__SVInt8_t arg) __arm_streaming_compatible { - sme_streaming_compatible_with_vl_arg(arg); -} - -void sme_streaming_compatible_calling_streaming_compatible_with_return_vl(void) __arm_streaming_compatible { - __SVInt8_t r = sme_streaming_compatible_returns_vl(); -} diff --git a/clang/test/Sema/alias-unused-win.cpp b/clang/test/Sema/alias-unused-win.cpp index 47c96d4117517..97d57a3bbd1e3 100644 --- a/clang/test/Sema/alias-unused-win.cpp +++ b/clang/test/Sema/alias-unused-win.cpp @@ -7,7 +7,7 @@ extern 
"C" { static int f(void) { return 42; } // cxx-warning{{unused function 'f'}} int g(void) __attribute__((alias("f"))); -static int foo [] = { 42, 0xDEAD }; // cxx-warning{{variable 'foo' is not needed and will not be emitted}} +static int foo [] = { 42, 0xDEAD }; extern typeof(foo) bar __attribute__((unused, alias("foo"))); static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} diff --git a/clang/test/Sema/alias-unused.cpp b/clang/test/Sema/alias-unused.cpp index dc8e46f072d74..c0b541c880e52 100644 --- a/clang/test/Sema/alias-unused.cpp +++ b/clang/test/Sema/alias-unused.cpp @@ -14,24 +14,26 @@ extern typeof(foo) bar __attribute__((unused, alias("foo"))); /// We report a warning in C++ mode because the internal linkage `resolver` gets /// mangled as it does not have a language linkage. GCC does not mangle /// `resolver` or report a warning. -static int (*resolver(void))(void) { return f; } // expected-warning{{unused function 'resolver'}} +static int (*resolver(void))(void) { return f; } // cxx-warning{{unused function 'resolver'}} int ifunc(void) __attribute__((ifunc("resolver"))); -static int __attribute__((overloadable)) f0(int x) { return x; } // expected-warning{{unused function 'f0'}} +static int __attribute__((overloadable)) f0(int x) { return x; } static float __attribute__((overloadable)) f0(float x) { return x; } // expected-warning{{unused function 'f0'}} int g0(void) __attribute__((alias("_ZL2f0i"))); #ifdef __cplusplus -static int f1() { return 42; } // expected-warning{{unused function 'f1'}} +static int f1() { return 42; } int g1(void) __attribute__((alias("_ZL2f1v"))); } -static int f2(int) { return 42; } // expected-warning{{unused function 'f2'}} -static int f2() { return 42; } // expected-warning{{unused function 'f2'}} +/// We demangle alias/ifunc target and mark all found functions as used. + +static int f2(int) { return 42; } // cxx-warning{{unused function 'f2'}} +static int f2() { return 42; } int g2() __attribute__((alias("_ZL2f2v"))); -static int (*resolver1())() { return f; } // expected-warning{{unused function 'resolver1'}} -static int (*resolver1(int))() { return f; } // expected-warning{{unused function 'resolver1'}} +static int (*resolver1())() { return f; } // cxx-warning{{unused function 'resolver1'}} +static int (*resolver1(int))() { return f; } int ifunc1() __attribute__((ifunc("_ZL9resolver1i"))); /// TODO: We should report "unused function" for f3(int). 
diff --git a/clang/test/Sema/attr-target-clones-aarch64.c b/clang/test/Sema/attr-target-clones-aarch64.c index bc3fceab82825..511e8a9411804 100644 --- a/clang/test/Sema/attr-target-clones-aarch64.c +++ b/clang/test/Sema/attr-target-clones-aarch64.c @@ -9,7 +9,7 @@ void __attribute__((target_clones("ssbs+ls64"))) warn2(void); // expected-error@+2 {{'target_clones' and 'target_version' attributes are not compatible}} // expected-note@+1 {{conflicting attribute is here}} -void __attribute__((target_version("sve-bf16"), target_clones("sme+memtag"))) not_compat(void); +void __attribute__((target_version("bf16"), target_clones("sme+memtag"))) not_compat(void); int redecl(void); int __attribute__((target_clones("frintts", "simd+fp", "default"))) redecl(void) { return 1; } @@ -21,26 +21,25 @@ int __attribute__((target_clones("sve+dotprod"))) redecl3(void); int redecl3(void); int __attribute__((target_clones("rng", "fp16fml+fp", "default"))) redecl4(void); -// expected-error@+3 {{'target_clones' attribute does not match previous declaration}} +// expected-error@+2 {{'target_clones' attribute does not match previous declaration}} // expected-note@-2 {{previous declaration is here}} -// expected-warning@+1 {{version list contains entries that don't impact code generation}} -int __attribute__((target_clones("dgh+memtag+rpres+ls64_v", "ebf16+dpb+sha1", "default"))) redecl4(void) { return 1; } +int __attribute__((target_clones("dpb2+memtag+rpres+ls64", "bf16+dpb+sha2", "default"))) redecl4(void) { return 1; } int __attribute__((target_version("flagm2"))) redef2(void) { return 1; } // expected-error@+2 {{multiversioned function redeclarations require identical target attributes}} // expected-note@-2 {{previous declaration is here}} int __attribute__((target_clones("flagm2", "default"))) redef2(void) { return 1; } -int __attribute__((target_clones("f32mm", "f64mm", "sha1+fp"))) redef3(void) { return 1; } +int __attribute__((target_clones("f32mm", "f64mm", "sha3+fp"))) redef3(void) { return 1; } // expected-error@+2 {{'target_clones' attribute does not match previous declaration}} // expected-note@-2 {{previous declaration is here}} -int __attribute__((target_clones("f32mm", "sha1+fp", "f64mm"))) redef3(void) { return 1; } +int __attribute__((target_clones("f32mm", "sha3+fp", "f64mm"))) redef3(void) { return 1; } int __attribute__((target_clones("rdm+lse+rdm", "lse+rdm"))) dup1(void) { return 1; } // expected-warning@+1 {{version list contains duplicate entries}} int __attribute__((target_clones("rdm+lse+rdm", "rdm+lse+rdm"))) dup2(void) { return 2; } // expected-warning@+1 {{version list contains duplicate entries}} -int __attribute__((target_clones("rcpc2+sve2-pmull128", "rcpc2+sve2-pmull128"))) dup3(void) { return 3; } +int __attribute__((target_clones("rcpc2+sve2-aes", "rcpc2+sve2-aes"))) dup3(void) { return 3; } // expected-warning@+1 {{version list contains duplicate entries}} void __attribute__((target_clones("sha3", "default", "default"))) dup4(void); // expected-warning@+2 {{version list contains duplicate entries}} @@ -49,7 +48,7 @@ int __attribute__((target_clones("fp", "fp", "crc+dotprod", "dotprod+crc"))) dup // expected-warning@+1 {{version list contains duplicate entries}} int __attribute__((target_clones("fp16+memtag", "memtag+fp16"))) dup6(void) { return 6; } -int __attribute__((target_clones("simd+ssbs2", "simd+dpb2"))) dup7(void) { return 7; } +int __attribute__((target_clones("simd+ssbs", "simd+dpb2"))) dup7(void) { return 7; } // expected-warning@+1 {{unsupported '' in the 
'target_clones' attribute string;}} void __attribute__((target_clones(""))) empty_target_1(void); @@ -72,13 +71,13 @@ empty_target_5(void); void __attribute__((target_clones("sve2-bitperm", "sve2-bitperm"))) dupe_normal(void); -void __attribute__((target_clones("default"), target_clones("memtag3+bti"))) dupe_normal2(void); +void __attribute__((target_clones("default"), target_clones("memtag+mops"))) dupe_normal2(void); int mv_after_use(void); int useage(void) { return mv_after_use(); } // expected-error@+1 {{function declaration cannot become a multiversioned function after first usage}} -int __attribute__((target_clones("sve2-sha3+ssbs2", "sm4"))) mv_after_use(void) { return 1; } +int __attribute__((target_clones("sve2-sha3+ssbs", "sm4"))) mv_after_use(void) { return 1; } // expected-error@+1 {{'main' cannot be a multiversioned function}} -int __attribute__((target_clones("sve-i8mm"))) main() { return 1; } +int __attribute__((target_clones("sve2-aes"))) main() { return 1; } diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c index cd5be459456eb..4ff8e1eee619e 100644 --- a/clang/test/Sema/attr-target-version.c +++ b/clang/test/Sema/attr-target-version.c @@ -16,7 +16,7 @@ int __attribute__((target_version("aes"))) foo(void) { return 1; } int __attribute__((target_version("default"))) foo(void) { return 2; } //expected-note@+1 {{previous definition is here}} -int __attribute__((target_version("sha3 + pmull "))) foo(void) { return 1; } +int __attribute__((target_version("sha3 + aes "))) foo(void) { return 1; } //expected-note@-1 {{previous definition is here}} //expected-error@+1 {{redefinition of 'foo'}} @@ -32,11 +32,11 @@ __attribute__ ((target("bf16,sve,sve2,dotprod"))) int func(void) { return 1; } __attribute__ ((target("default"))) int func(void) { return 0; } //expected-note@+1 {{previous declaration is here}} -void __attribute__((target_version("bti+flagm2"))) one(void) {} +void __attribute__((target_version("mops+flagm2"))) one(void) {} //expected-error@+1 {{multiversioned function redeclarations require identical target attributes}} -void __attribute__((target_version("flagm2+bti"))) one(void) {} +void __attribute__((target_version("flagm2+mops"))) one(void) {} -void __attribute__((target_version("ssbs+sha1"))) two(void) {} +void __attribute__((target_version("ssbs+sha2"))) two(void) {} void __attribute__((target_version("ssbs+fp16fml"))) two(void) {} //expected-error@+1 {{'main' cannot be a multiversioned function}} @@ -44,7 +44,7 @@ int __attribute__((target_version("lse"))) main(void) { return 1; } // It is ok for the default version to appear first. int default_first(void) { return 1; } -int __attribute__((target_version("dit"))) default_first(void) { return 2; } +int __attribute__((target_version("crc"))) default_first(void) { return 2; } int __attribute__((target_version("mops"))) default_first(void) { return 3; } // It is ok if the default version is between other versions. 
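To make the default-version ordering rules in the comments above concrete, here is a hypothetical sketch (function name invented, feature strings taken from the test): the unattributed definition acts as the default version and may appear before, between, or after the feature-specific versions.

```cpp
// Hypothetical illustration: the plain definition is the default version of
// the multiversioned function; the attributed versions may follow it.
int pick(void) { return 0; }                                       // default
int __attribute__((target_version("crc"))) pick(void) { return 1; }
int __attribute__((target_version("mops"))) pick(void) { return 2; }
```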
@@ -77,7 +77,7 @@ void __attribute__((target_version("rdm+rng+crc"))) redef(void) {} void __attribute__((target_version("rdm+rng+crc"))) redef(void) {} int def(void); -void __attribute__((target_version("dit"))) nodef(void); +void __attribute__((target_version("crc"))) nodef(void); void __attribute__((target_version("ls64"))) nodef(void); void __attribute__((target_version("aes"))) ovl(void); void __attribute__((target_version("default"))) ovl(void); @@ -89,12 +89,12 @@ int bar() { return def(); } // expected-error@+1 {{function declaration cannot become a multiversioned function after first usage}} -int __attribute__((target_version("sha1"))) def(void) { return 1; } +int __attribute__((target_version("sha3"))) def(void) { return 1; } int __attribute__((target_version("sve"))) prot(); // expected-error@-1 {{multiversioned function must have a prototype}} -int __attribute__((target_version("pmull"))) rtype(int); +int __attribute__((target_version("aes"))) rtype(int); // expected-error@+1 {{multiversioned function declaration has a different return type}} float __attribute__((target_version("rdm"))) rtype(int); @@ -102,7 +102,7 @@ int __attribute__((target_version("sha2"))) combine(void) { return 1; } // expected-error@+1 {{multiversioned function declaration has a different calling convention}} int __attribute__((aarch64_vector_pcs, target_version("sha3"))) combine(void) { return 2; } -int __attribute__((target_version("fp+aes+pmull+rcpc"))) unspec_args() { return -1; } +int __attribute__((target_version("fp+aes+sha3+rcpc"))) unspec_args() { return -1; } // expected-error@-1 {{multiversioned function must have a prototype}} // expected-error@+1 {{multiversioned function must have a prototype}} int __attribute__((target_version("default"))) unspec_args() { return 0; } diff --git a/clang/test/Sema/builtin-allow-runtime-check.c b/clang/test/Sema/builtin-allow-runtime-check.c new file mode 100644 index 0000000000000..b656861000075 --- /dev/null +++ b/clang/test/Sema/builtin-allow-runtime-check.c @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -fsyntax-only -triple x86_64-pc-linux-gnu -verify %s +// RUN: %clang_cc1 -fsyntax-only -triple aarch64-linux-gnu -verify %s + +extern const char *str; + +int main(void) { + int r = 0; + + r |= __builtin_allow_runtime_check(); // expected-error {{too few arguments to function call}} + + r |= __builtin_allow_runtime_check(str); // expected-error {{expression is not a string literal}} + + r |= __builtin_allow_runtime_check(5); // expected-error {{incompatible integer to pointer conversion}} expected-error {{expression is not a string literal}} + + r |= __builtin_allow_runtime_check("a", "b"); // expected-error {{too many arguments to function call}} + + r |= __builtin_allow_runtime_check(""); + + r |= __builtin_allow_runtime_check("check"); + + str = __builtin_allow_runtime_check("check2"); // expected-error {{incompatible integer to pointer conversion}} + + return r; +} diff --git a/clang/test/Sema/recover-expr-gh88008-nocrash.c b/clang/test/Sema/recover-expr-gh88008-nocrash.c new file mode 100644 index 0000000000000..5500b33dd0e85 --- /dev/null +++ b/clang/test/Sema/recover-expr-gh88008-nocrash.c @@ -0,0 +1,11 @@ +// RUN: %clang_cc1 %s -verify -fsyntax-only -std=c90 + +struct S { + int v; +}; + +struct T; // expected-note {{forward declaration of 'struct T'}} + +void gh88008_nocrash(struct T *t) { + struct S s = { .v = t->y }; // expected-error {{incomplete definition of type 'struct T'}} +} diff --git a/clang/test/Sema/static-assert.c 
b/clang/test/Sema/static-assert.c index ae5e8076e0bed..4e9e6b7ee558b 100644 --- a/clang/test/Sema/static-assert.c +++ b/clang/test/Sema/static-assert.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c11 -Wgnu-folding-constant -fsyntax-only -verify=expected,c %s -// RUN: %clang_cc1 -fms-compatibility -Wgnu-folding-constant -DMS -fsyntax-only -verify=expected,ms,c %s -// RUN: %clang_cc1 -std=c99 -pedantic -Wgnu-folding-constant -fsyntax-only -verify=expected,ext,c %s +// RUN: %clang_cc1 -std=c11 -Wgnu-folding-constant -fsyntax-only -verify %s +// RUN: %clang_cc1 -fms-compatibility -Wgnu-folding-constant -DMS -fsyntax-only -verify=expected,ms %s +// RUN: %clang_cc1 -std=c99 -pedantic -Wgnu-folding-constant -fsyntax-only -verify=expected,ext %s // RUN: %clang_cc1 -xc++ -std=c++11 -pedantic -fsyntax-only -verify=expected,ext,cxx %s _Static_assert("foo", "string is nonzero"); // ext-warning {{'_Static_assert' is a C11 extension}} @@ -57,8 +57,7 @@ UNION(char[2], short) u2 = { .one = { 'a', 'b' } }; // ext-warning 3 {{'_Static_ typedef UNION(char, short) U3; // expected-error {{static assertion failed due to requirement 'sizeof(char) == sizeof(short)': type size mismatch}} \ // expected-note{{evaluates to '1 == 2'}} \ // ext-warning 3 {{'_Static_assert' is a C11 extension}} -typedef UNION(float, 0.5f) U4; // c-error {{expected a type}} \ - // cxx-error {{type name requires a specifier or qualifier}} \ +typedef UNION(float, 0.5f) U4; // expected-error {{expected a type}} \ // ext-warning 3 {{'_Static_assert' is a C11 extension}} // After defining the assert macro in MS-compatibility mode, we should diff --git a/clang/test/SemaCXX/PR84020.cpp b/clang/test/SemaCXX/PR84020.cpp new file mode 100644 index 0000000000000..8ea5dcc4527ae --- /dev/null +++ b/clang/test/SemaCXX/PR84020.cpp @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -std=c++20 -verify %s +// RUN: %clang_cc1 -std=c++23 -verify %s +// expected-no-diagnostics + +struct B { + template + void foo(); + + void bar(); +}; + +template +struct A : T { + auto foo() { + static_assert(requires { T::template foo(); }); + static_assert(requires { T::bar(); }); + } +}; + +int main() { + A a; + a.foo(); +} diff --git a/clang/test/SemaCXX/attr-target-version.cpp b/clang/test/SemaCXX/attr-target-version.cpp index b3385f043590f..ae2923cc303c2 100644 --- a/clang/test/SemaCXX/attr-target-version.cpp +++ b/clang/test/SemaCXX/attr-target-version.cpp @@ -25,13 +25,13 @@ int __attribute__((target_version("dpb"))) diff_link(void); int __attribute__((target_version("memtag"))) diff_link1(void) { return 1; } //expected-error@+1 {{multiversioned function declaration has a different linkage}} -static int __attribute__((target_version("bti"))) diff_link1(void); +static int __attribute__((target_version("mops"))) diff_link1(void); int __attribute__((target_version("flagm2"))) diff_link2(void) { return 1; } extern int __attribute__((target_version("flagm"))) diff_link2(void); namespace { -static int __attribute__((target_version("memtag3"))) diff_link2(void) { return 2; } +static int __attribute__((target_version("memtag"))) diff_link2(void) { return 2; } int __attribute__((target_version("sve2-bitperm"))) diff_link2(void) { return 1; } } // namespace @@ -49,7 +49,7 @@ double __attribute__((target_version("rcpc"))) diff_type1(void); auto __attribute__((target_version("rcpc2"))) diff_type2(void) -> int { return 1; } //expected-error@+1 {{multiversioned function declaration has a different return type}} -auto __attribute__((target_version("sve-bf16"))) diff_type2(void) -> long { return 
(long)1; } +auto __attribute__((target_version("sve2-aes"))) diff_type2(void) -> long { return (long)1; } int __attribute__((target_version("fp16fml"))) diff_type3(void) noexcept(false) { return 1; } //expected-error@+2 {{exception specification in declaration does not match previous declaration}} @@ -75,7 +75,7 @@ auto __attribute__((target_version("dpb2"))) ret3(void) -> int { return 1; } class Cls { __attribute__((target_version("rng"))) Cls(); // expected-error@-1 {{attribute 'target_version' multiversioned functions do not yet support constructors}} - __attribute__((target_version("sve-i8mm"))) ~Cls(); + __attribute__((target_version("sve2-aes"))) ~Cls(); // expected-error@-1 {{attribute 'target_version' multiversioned functions do not yet support destructors}} Cls &__attribute__((target_version("f32mm"))) operator=(const Cls &) = default; @@ -98,11 +98,11 @@ __attribute__((target_version("jscvt"))) void Decl(); } // namespace Nms class Out { - int __attribute__((target_version("bti"))) func(void); - int __attribute__((target_version("ssbs2"))) func(void); + int __attribute__((target_version("mops"))) func(void); + int __attribute__((target_version("ssbs"))) func(void); }; -int __attribute__((target_version("bti"))) Out::func(void) { return 1; } -int __attribute__((target_version("ssbs2"))) Out::func(void) { return 2; } +int __attribute__((target_version("mops"))) Out::func(void) { return 1; } +int __attribute__((target_version("ssbs"))) Out::func(void) { return 2; } // expected-error@+3 {{out-of-line definition of 'func' does not match any declaration in 'Out'}} // expected-note@-3 {{member declaration nearly matches}} // expected-note@-3 {{member declaration nearly matches}} diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp index 567094c94c171..080b4476c7eec 100644 --- a/clang/test/SemaCXX/builtins.cpp +++ b/clang/test/SemaCXX/builtins.cpp @@ -76,6 +76,11 @@ using ConstMemFnType = int (Dummy::*)() const; void foo() {} +void test_builtin_empty_parentheses_diags() { + __is_trivially_copyable(); // expected-error {{expected a type}} + __is_trivially_copyable(1); // expected-error {{expected a type}} +} + void test_builtin_launder_diags(void *vp, const void *cvp, FnType *fnp, MemFnType mfp, ConstMemFnType cmfp, int (&Arr)[5]) { __builtin_launder(vp); // expected-error {{void pointer argument to '__builtin_launder' is not allowed}} diff --git a/clang/test/SemaCXX/deprecated-builtins.cpp b/clang/test/SemaCXX/deprecated-builtins.cpp index 849b9b014fff2..fafc1da4da13e 100644 --- a/clang/test/SemaCXX/deprecated-builtins.cpp +++ b/clang/test/SemaCXX/deprecated-builtins.cpp @@ -17,3 +17,8 @@ void f() { a = __has_trivial_destructor(A); // expected-warning-re {{__has_trivial_destructor {{.*}} use __is_trivially_destructible}} } + +void test_builtin_empty_parentheses_diags(void) { + __has_nothrow_copy(); // expected-error {{expected a type}} + __has_nothrow_copy(1); // expected-error {{expected a type}} +} diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp new file mode 100644 index 0000000000000..bf4faec184ee1 --- /dev/null +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-suggestions-crashes.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -std=c++20 -Wunsafe-buffer-usage \ +// RUN: -fsafe-buffer-usage-suggestions \ +// RUN: %s -verify %s + +char * unsafe_pointer; // expected-warning{{'unsafe_pointer' is an unsafe pointer used for buffer access}} + +void test(char * param) { 
+} + +void dre_parenthesized() { + test(&(unsafe_pointer)[1]); // no-crash // expected-note{{used in buffer access here}} +} diff --git a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp index 018f0b68c7810..6d2efcf81eb6e 100644 --- a/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp +++ b/clang/test/SemaOpenACC/compute-construct-clause-ast.cpp @@ -110,6 +110,50 @@ void TemplFunc() { // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt +#pragma acc serial self + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: self clause + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +#pragma acc kernels self(T::SomeFloat) + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels + // CHECK-NEXT: self clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +#pragma acc parallel self(T::SomeFloat) if (T::SomeFloat) + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel + // CHECK-NEXT: self clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: if clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +#pragma acc serial if(T::SomeFloat) self(T::SomeFloat) + while(true); + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: if clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: self clause + // CHECK-NEXT: DependentScopeDeclRefExpr{{.*}} '' lvalue + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'T' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + // Match the instantiation: // CHECK: FunctionDecl{{.*}}TemplFunc{{.*}}implicit_instantiation // CHECK-NEXT: TemplateArgument type 'InstTy' @@ -171,6 +215,53 @@ void TemplFunc() { // CHECK-NEXT: WhileStmt // CHECK-NEXT: CXXBoolLiteralExpr // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: self clause + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}kernels + // CHECK-NEXT: self clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}parallel + // CHECK-NEXT: self clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: 
CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + + // CHECK-NEXT: OpenACCComputeConstruct{{.*}}serial + // CHECK-NEXT: if clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: self clause + // CHECK-NEXT: ImplicitCastExpr{{.*}}'bool' + // CHECK-NEXT: ImplicitCastExpr{{.*}}'float' + // CHECK-NEXT: DeclRefExpr{{.*}} 'const float' lvalue Var{{.*}} 'SomeFloat' 'const float' + // CHECK-NEXT: NestedNameSpecifier TypeSpec 'InstTy' + // CHECK-NEXT: WhileStmt + // CHECK-NEXT: CXXBoolLiteralExpr + // CHECK-NEXT: NullStmt + +} struct BoolConversion{ operator bool() const;}; diff --git a/clang/test/SemaOpenACC/compute-construct-self-clause.c b/clang/test/SemaOpenACC/compute-construct-self-clause.c new file mode 100644 index 0000000000000..fbed2953419a2 --- /dev/null +++ b/clang/test/SemaOpenACC/compute-construct-self-clause.c @@ -0,0 +1,82 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +void BoolExpr(int *I, float *F) { + typedef struct {} SomeStruct; + struct C{}; + // expected-error@+1{{expected expression}} +#pragma acc parallel self (struct C f()) + while(0); + + // expected-error@+1{{unexpected type name 'SomeStruct': expected expression}} +#pragma acc serial self (SomeStruct) + while(0); + + // expected-error@+1{{unexpected type name 'SomeStruct': expected expression}} +#pragma acc serial self (SomeStruct()) + while(0); + + SomeStruct S; + // expected-error@+1{{statement requires expression of scalar type ('SomeStruct' invalid)}} +#pragma acc serial self (S) + while(0); + +#pragma acc parallel self (I) + while(0); + +#pragma acc serial self (F) + while(0); + +#pragma acc kernels self (*I < *F) + while(0); +} + +void WarnMaybeNotUsed(int val1, int val2) { + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self if(val1) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(val1) if(val1) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(val1) self + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(val1) self(val2) + while(0); + + // The below don't warn because one side or the other has an error and thus is + // not added to the AST. 
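An aside before the error cases continue: the four warnings above all reduce to the shape below (hypothetical snippet, identifiers invented). Per the new diagnostic, the `self` clause is reported as having no effect whenever the `if` clause's condition evaluates to true, with a note pointing at the other clause.

```cpp
// Hypothetical distillation of the diagnosed pattern.
void sketch(int cond) {
  // warning: OpenACC construct 'self' has no effect when an 'if' clause
  // evaluates to true; note: previous clause is here
#pragma acc parallel self if(cond)
  while(0);
}
```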
+ + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel self if(invalid) + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel self(invalid) if(val1) + while(0); + + // expected-error@+2{{expected expression}} + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel self() if(invalid) + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel if(invalid) self + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel if(val2) self(invalid) + while(0); + + // expected-error@+1{{use of undeclared identifier 'invalid'}} +#pragma acc parallel if(invalid) self(val1) + while(0); +} diff --git a/clang/test/SemaOpenACC/compute-construct-self-clause.cpp b/clang/test/SemaOpenACC/compute-construct-self-clause.cpp new file mode 100644 index 0000000000000..60edbdc2b1191 --- /dev/null +++ b/clang/test/SemaOpenACC/compute-construct-self-clause.cpp @@ -0,0 +1,99 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct NoBoolConversion{}; +struct BoolConversion{ + operator bool(); +}; + +template <typename T, typename U> +void BoolExpr() { + // expected-error@+1{{value of type 'NoBoolConversion' is not contextually convertible to 'bool'}} +#pragma acc parallel self (NoBoolConversion{}) + while(0); + // expected-error@+2{{no member named 'NotValid' in 'NoBoolConversion'}} + // expected-note@#INST{{in instantiation of function template specialization}} +#pragma acc parallel self (T::NotValid) + while(0); + +#pragma acc parallel self (BoolConversion{}) + while(0); + + // expected-error@+1{{value of type 'NoBoolConversion' is not contextually convertible to 'bool'}} +#pragma acc parallel self (T{}) + while(0); + +#pragma acc parallel self (U{}) + while(0); +} + +struct HasBool { + static constexpr bool B = true; +}; + +template <typename T> +void WarnMaybeNotUsed() { + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self if(T::B) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(T::B) if(T::B) + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::B) self + while(0); + + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::B) self(T::B) + while(0); + + // We still warn in the cases of dependent failures, since the diagnostic + // happens immediately rather than during instantiation. 
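A distilled view of the point the comment just made (hypothetical snippet, identifiers invented): the clause combination is diagnosed when the pragma is parsed, so a dependent condition still produces the warning even if the enclosing template is never instantiated. The dependent-failure cases of the test continue below.

```cpp
// Hypothetical illustration: the warning appears at parse time, with no
// instantiation of 'parse_time' required.
template <typename T>
void parse_time() {
#pragma acc parallel self if(T::B) // warned immediately
  while(0);
}
```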
+ + // expected-error@+4{{no member named 'Invalid' in 'HasBool'}} + // expected-note@#NOT_USED_INST{{in instantiation of function template specialization 'WarnMaybeNotUsed<HasBool>' requested here}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self if(T::Invalid) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(T::Invalid) if(T::B) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel self(T::B) if(T::Invalid) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::Invalid) self + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::Invalid) self(T::B) + while(0); + + // expected-error@+3{{no member named 'Invalid' in 'HasBool'}} + // expected-warning@+2{{OpenACC construct 'self' has no effect when an 'if' clause evaluates to true}} + // expected-note@+1{{previous clause is here}} +#pragma acc parallel if(T::B) self(T::Invalid) + while(0); +} + +void Instantiate() { + BoolExpr<NoBoolConversion, BoolConversion>(); // #INST + WarnMaybeNotUsed<HasBool>(); // #NOT_USED_INST +} diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp index eaa76dd43e41d..f42af7e330e17 100644 --- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp +++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp @@ -72,6 +72,7 @@ enum ResourceDirRecipeKind { RDRK_InvokeCompiler, }; +static std::string OutputFileName = "-"; static ScanningMode ScanMode = ScanningMode::DependencyDirectivesScan; static ScanningOutputFormat Format = ScanningOutputFormat::Make; static ScanningOptimizations OptimizeArgs; @@ -98,8 +99,8 @@ static bool RoundTripArgs = DoRoundTripDefault; static void ParseArgs(int argc, char **argv) { ScanDepsOptTable Tbl; llvm::StringRef ToolName = argv[0]; - llvm::BumpPtrAllocator A; - llvm::StringSaver Saver{A}; + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; llvm::opt::InputArgList Args = Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) { llvm::errs() << Msg << '\n'; @@ -175,6 +176,9 @@ static void ParseArgs(int argc, char **argv) { if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_files_dir_EQ)) ModuleFilesDir = A->getValue(); + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_o)) + OutputFileName = A->getValue(); + EagerLoadModules = Args.hasArg(OPT_eager_load_pcm); if (const llvm::opt::Arg *A = Args.getLastArg(OPT_j)) { @@ -186,14 +190,8 @@ static void ParseArgs(int argc, char **argv) { } } - if (const llvm::opt::Arg *A = Args.getLastArg(OPT_compilation_database_EQ)) { + if (const llvm::opt::Arg *A = Args.getLastArg(OPT_compilation_database_EQ)) CompilationDB = A->getValue(); - } else if (Format != ScanningOutputFormat::P1689) { - 
llvm::errs() << ToolName - << ": for the --compiilation-database option: must be " - "specified at least once!"; - std::exit(1); - } if (const llvm::opt::Arg *A = Args.getLastArg(OPT_module_name_EQ)) ModuleName = A->getValue(); @@ -225,9 +223,8 @@ static void ParseArgs(int argc, char **argv) { RoundTripArgs = Args.hasArg(OPT_round_trip_args); - if (auto *A = Args.getLastArgNoClaim(OPT_DASH_DASH)) - CommandLine.insert(CommandLine.end(), A->getValues().begin(), - A->getValues().end()); + if (const llvm::opt::Arg *A = Args.getLastArgNoClaim(OPT_DASH_DASH)) + CommandLine.assign(A->getValues().begin(), A->getValues().end()); } class SharedStream { @@ -426,6 +423,11 @@ class FullDeps { } void printFullOutput(raw_ostream &OS) { + // Skip sorting modules and constructing the JSON object if the output + // cannot be observed anyway. This makes timings less noisy. + if (&OS == &llvm::nulls()) + return; + // Sort the modules by name to get a deterministic order. std::vector ModuleIDs; for (auto &&M : Modules) @@ -694,38 +696,28 @@ static std::string getModuleCachePath(ArrayRef Args) { return std::string(Path); } -// getCompilationDataBase - If -compilation-database is set, load the -// compilation database from the specified file. Otherwise if the we're -// generating P1689 format, trying to generate the compilation database -// form specified command line after the positional parameter "--". +/// Attempts to construct the compilation database from '-compilation-database' +/// or from the arguments following the positional '--'. static std::unique_ptr -getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) { +getCompilationDatabase(int argc, char **argv, std::string &ErrorMessage) { ParseArgs(argc, argv); + if (!(CommandLine.empty() ^ CompilationDB.empty())) { + llvm::errs() << "The compilation command line must be provided either via " + "'-compilation-database' or after '--'."; + return nullptr; + } + if (!CompilationDB.empty()) return tooling::JSONCompilationDatabase::loadFromFile( CompilationDB, ErrorMessage, tooling::JSONCommandLineSyntax::AutoDetect); - if (Format != ScanningOutputFormat::P1689) { - llvm::errs() << "the --compilation-database option: must be specified at " - "least once!"; - return nullptr; - } - - // Trying to get the input file, the output file and the command line options - // from the positional parameter "--". 
- char **DoubleDash = std::find(argv, argv + argc, StringRef("--")); - if (DoubleDash == argv + argc) { - llvm::errs() << "The command line arguments is required after '--' in " - "P1689 per file mode."; - return nullptr; - } - llvm::IntrusiveRefCntPtr Diags = CompilerInstance::createDiagnostics(new DiagnosticOptions); driver::Driver TheDriver(CommandLine[0], llvm::sys::getDefaultTargetTriple(), *Diags); + TheDriver.setCheckInputsExist(false); std::unique_ptr C( TheDriver.BuildCompilation(CommandLine)); if (!C || C->getJobs().empty()) @@ -740,7 +732,8 @@ getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) { FrontendOptions &FEOpts = CI->getFrontendOpts(); if (FEOpts.Inputs.size() != 1) { - llvm::errs() << "Only one input file is allowed in P1689 per file mode."; + llvm::errs() + << "Exactly one input file is required in the per-file mode ('--').\n"; return nullptr; } @@ -749,8 +742,9 @@ getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) { auto LastCmd = C->getJobs().end(); LastCmd--; if (LastCmd->getOutputFilenames().size() != 1) { - llvm::errs() << "The command line should provide exactly one output file " - "in P1689 per file mode.\n"; + llvm::errs() + << "Exactly one output file is required in the per-file mode ('--').\n"; + return nullptr; } StringRef OutputFile = LastCmd->getOutputFilenames().front(); @@ -790,7 +784,7 @@ getCompilationDataBase(int argc, char **argv, std::string &ErrorMessage) { int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { std::string ErrorMessage; std::unique_ptr Compilations = - getCompilationDataBase(argc, argv, ErrorMessage); + getCompilationDatabase(argc, argv, ErrorMessage); if (!Compilations) { llvm::errs() << ErrorMessage << "\n"; return 1; @@ -864,8 +858,25 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { }); SharedStream Errs(llvm::errs()); - // Print out the dependency results to STDOUT by default. 
- SharedStream DependencyOS(llvm::outs()); + + std::optional FileOS; + llvm::raw_ostream &ThreadUnsafeDependencyOS = [&]() -> llvm::raw_ostream & { + if (OutputFileName == "-") + return llvm::outs(); + + if (OutputFileName == "/dev/null") + return llvm::nulls(); + + std::error_code EC; + FileOS.emplace(OutputFileName, EC); + if (EC) { + llvm::errs() << "Failed to open output file '" << OutputFileName + << "': " << llvm::errorCodeToError(EC) << '\n'; + std::exit(1); + } + return *FileOS; + }(); + SharedStream DependencyOS(ThreadUnsafeDependencyOS); std::vector Inputs = AdjustingCompilations->getAllCompileCommands(); @@ -1006,9 +1017,9 @@ int clang_scan_deps_main(int argc, char **argv, const llvm::ToolContext &) { HadErrors = true; if (Format == ScanningOutputFormat::Full) - FD->printFullOutput(llvm::outs()); + FD->printFullOutput(ThreadUnsafeDependencyOS); else if (Format == ScanningOutputFormat::P1689) - PD.printDependencies(llvm::outs()); + PD.printDependencies(ThreadUnsafeDependencyOS); return HadErrors; } diff --git a/clang/tools/clang-scan-deps/Opts.td b/clang/tools/clang-scan-deps/Opts.td index 5cd5d1a9fb37b..4837ce6f070d7 100644 --- a/clang/tools/clang-scan-deps/Opts.td +++ b/clang/tools/clang-scan-deps/Opts.td @@ -11,6 +11,8 @@ multiclass Eq { def help : Flag<["--"], "help">, HelpText<"Display this help">; def version : Flag<["--"], "version">, HelpText<"Display the version">; +def o : Arg<"o", "Destination of the primary output">; + defm mode : Eq<"mode", "The preprocessing mode used to compute the dependencies">; defm format : Eq<"format", "The output format for the dependencies">; @@ -37,4 +39,4 @@ def verbose : F<"v", "Use verbose output">; def round_trip_args : F<"round-trip-args", "verify that command-line arguments are canonical by parsing and re-serializing">; -def DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>; \ No newline at end of file +def DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>; diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index f304786ff9dff..2ef599d2cd26f 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2791,6 +2791,10 @@ void OpenACCClauseEnqueue::VisitDefaultClause(const OpenACCDefaultClause &C) {} void OpenACCClauseEnqueue::VisitIfClause(const OpenACCIfClause &C) { Visitor.AddStmt(C.getConditionExpr()); } +void OpenACCClauseEnqueue::VisitSelfClause(const OpenACCSelfClause &C) { + if (C.hasConditionExpr()) + Visitor.AddStmt(C.getConditionExpr()); +} } // namespace void EnqueueVisitor::EnqueueChildren(const OpenACCClause *C) { diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index f58ce4aebcbfc..9c1dc1a76db63 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -977,6 +977,36 @@ TEST(ExprMutationAnalyzerTest, FollowFuncArgModified) { "void f() { int x; g(x); }"); Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("g(x)")); + + AST = buildASTFromCode( + StdRemoveReference + StdForward + + "template void f1(T &&a);" + "template void f2(T &&a);" + "template void f1(T &&a) { f2(std::forward(a)); }" + "template void f2(T &&a) { f1(std::forward(a)); }" + "void f() { int x; f1(x); }"); + Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_FALSE(isMutated(Results, AST.get())); + + AST = buildASTFromCode( + 
StdRemoveReference + StdForward + + "template void f1(T &&a);" + "template void f2(T &&a);" + "template void f1(T &&a) { f2(std::forward(a)); }" + "template void f2(T &&a) { f1(std::forward(a)); a++; }" + "void f() { int x; f1(x); }"); + Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("f1(x)")); + + AST = buildASTFromCode( + StdRemoveReference + StdForward + + "template void f1(T &&a);" + "template void f2(T &&a);" + "template void f1(T &&a) { f2(std::forward(a)); a++; }" + "template void f2(T &&a) { f1(std::forward(a)); }" + "void f() { int x; f1(x); }"); + Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("f1(x)")); } TEST(ExprMutationAnalyzerTest, FollowFuncArgNotModified) { diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index 00dafb2988c69..97ec32126c1dc 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -3098,6 +3098,58 @@ TEST(TransferTest, ResultObjectLocationForCXXOperatorCallExpr) { }); } +// Check that the `std::strong_ordering` object returned by builtin `<=>` has a +// correctly modeled result object location. +TEST(TransferTest, ResultObjectLocationForBuiltinSpaceshipOperator) { + std::string Code = R"( + namespace std { + // This is the minimal definition required to get + // `Sema::CheckComparisonCategoryType()` to accept this fake. + struct strong_ordering { + enum class ordering { less, equal, greater }; + ordering o; + static const strong_ordering less; + static const strong_ordering equivalent; + static const strong_ordering equal; + static const strong_ordering greater; + }; + + inline constexpr strong_ordering strong_ordering::less = + { strong_ordering::ordering::less }; + inline constexpr strong_ordering strong_ordering::equal = + { strong_ordering::ordering::equal }; + inline constexpr strong_ordering strong_ordering::equivalent = + { strong_ordering::ordering::equal }; + inline constexpr strong_ordering strong_ordering::greater = + { strong_ordering::ordering::greater }; + } + void target(int i, int j) { + auto ordering = i <=> j; + // [[p]] + } + )"; + using ast_matchers::binaryOperator; + using ast_matchers::hasOperatorName; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto *Spaceship = selectFirst( + "op", + match(binaryOperator(hasOperatorName("<=>")).bind("op"), ASTCtx)); + + EXPECT_EQ( + &Env.getResultObjectLocation(*Spaceship), + &getLocForDecl(ASTCtx, Env, "ordering")); + }, + LangStandard::lang_cxx20); +} + TEST(TransferTest, ResultObjectLocationForStdInitializerListExpr) { std::string Code = R"( namespace std { @@ -3130,6 +3182,58 @@ TEST(TransferTest, ResultObjectLocationForStdInitializerListExpr) { }); } +TEST(TransferTest, ResultObjectLocationForStmtExpr) { + std::string Code = R"( + struct S {}; + void target() { + S s = ({ S(); }); + // [[p]] + } + )"; + using ast_matchers::cxxConstructExpr; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = 
getEnvironmentAtAnnotation(Results, "p"); + + auto *Construct = selectFirst( + "construct", match(cxxConstructExpr().bind("construct"), ASTCtx)); + + EXPECT_EQ(&Env.getResultObjectLocation(*Construct), + &getLocForDecl(ASTCtx, Env, "s")); + }); +} + +TEST(TransferTest, ResultObjectLocationForBuiltinBitCastExpr) { + std::string Code = R"( + struct S { int i; }; + void target(int i) { + S s = __builtin_bit_cast(S, i); + // [[p]] + } + )"; + using ast_matchers::explicitCastExpr; + using ast_matchers::match; + using ast_matchers::selectFirst; + using ast_matchers::traverse; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + auto *BuiltinBitCast = selectFirst( + "cast", match(explicitCastExpr().bind("cast"), ASTCtx)); + + EXPECT_EQ(&Env.getResultObjectLocation(*BuiltinBitCast), + &getLocForDecl(ASTCtx, Env, "s")); + }); +} + TEST(TransferTest, ResultObjectLocationPropagatesThroughConditionalOperator) { std::string Code = R"( struct A { diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 04e1acc270500..56f1fdf9ef574 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2266,7 +2266,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = false; } if (!InIfdef && IsA64) { - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; InIfdef = true; } @@ -2299,7 +2299,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = false; } if (!InIfdef && IsA64) { - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; InIfdef = true; } @@ -2381,7 +2381,7 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "#include \n"; // For now, signedness of polynomial types depends on target - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; OS << "typedef uint8_t poly8_t;\n"; OS << "typedef uint16_t poly16_t;\n"; OS << "typedef uint64_t poly64_t;\n"; @@ -2582,7 +2582,7 @@ void NeonEmitter::runVectorTypes(raw_ostream &OS) { OS << "typedef float float32_t;\n"; OS << "typedef __fp16 float16_t;\n"; - OS << "#ifdef __aarch64__\n"; + OS << "#if defined(__aarch64__) || defined(__arm64ec__)\n"; OS << "typedef double float64_t;\n"; OS << "#endif\n\n"; diff --git a/clang/www/c_status.html b/clang/www/c_status.html index 9893170ae8473..dfc1afefda245 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -295,11 +295,6 @@
C99 implementation status
       <td>N570</td>
       <td>Yes</td>
     </tr>
-    <tr>
-      <td>new structure type compatibility (tag compatibility)</td>
-      <td>N522</td>
-      <td>Unknown</td>
-    </tr>
     <tr>
       <td>additional predefined macro names</td>
       <td>Unknown</td>
diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index e8e5f612d5b03..6d413f6753bc0 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -368,6 +368,12 @@ macro(construct_compiler_rt_default_triple) "Default triple for which compiler-rt runtimes will be built.") endif() + if ("${CMAKE_C_COMPILER_ID}" MATCHES "Clang") + execute_process(COMMAND ${CMAKE_C_COMPILER} --target=${COMPILER_RT_DEFAULT_TARGET_TRIPLE} -print-effective-triple + OUTPUT_VARIABLE COMPILER_RT_DEFAULT_TARGET_TRIPLE + OUTPUT_STRIP_TRAILING_WHITESPACE) + endif() + string(REPLACE "-" ";" LLVM_TARGET_TRIPLE_LIST ${COMPILER_RT_DEFAULT_TARGET_TRIPLE}) list(GET LLVM_TARGET_TRIPLE_LIST 0 COMPILER_RT_DEFAULT_TARGET_ARCH) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c index 17bddfca46f09..1ac4d85a0c139 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.c +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -67,13 +67,10 @@ enum CPUFeatures { FEAT_FP, FEAT_SIMD, FEAT_CRC, - FEAT_SHA1, FEAT_SHA2, FEAT_SHA3, FEAT_AES, - FEAT_PMULL, FEAT_FP16, - FEAT_DIT, FEAT_DPB, FEAT_DPB2, FEAT_JSCVT, @@ -81,35 +78,23 @@ enum CPUFeatures { FEAT_RCPC, FEAT_RCPC2, FEAT_FRINTTS, - FEAT_DGH, FEAT_I8MM, FEAT_BF16, - FEAT_EBF16, FEAT_RPRES, FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, FEAT_SVE_F32MM, FEAT_SVE_F64MM, FEAT_SVE2, FEAT_SVE_AES, - FEAT_SVE_PMULL128, FEAT_SVE_BITPERM, FEAT_SVE_SHA3, FEAT_SVE_SM4, FEAT_SME, FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, FEAT_SB, FEAT_PREDRES, FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, @@ -117,7 +102,7 @@ enum CPUFeatures { FEAT_SME2, FEAT_RCPC3, FEAT_MOPS, FEAT_MAX, - FEAT_EXT = 62, // Reserved to indicate presence of additional features field + FEAT_EXT = 47, // Reserved to indicate presence of additional features field // in __aarch64_cpu_features FEAT_INIT // Used as flag of features initialization completion }; diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc index 6fef109567b61..19114c4abdfca 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -35,13 +35,10 @@ void __init_cpu_features_resolver(void) { {"hw.optional.floatingpoint", FEAT_FP}, {"hw.optional.AdvSIMD", FEAT_SIMD}, {"hw.optional.armv8_crc32", FEAT_CRC}, - {"hw.optional.arm.FEAT_SHA1", FEAT_SHA1}, {"hw.optional.arm.FEAT_SHA256", FEAT_SHA2}, {"hw.optional.arm.FEAT_SHA3", FEAT_SHA3}, {"hw.optional.arm.FEAT_AES", FEAT_AES}, - {"hw.optional.arm.FEAT_PMULL", FEAT_PMULL}, {"hw.optional.arm.FEAT_FP16", FEAT_FP16}, - {"hw.optional.arm.FEAT_DIT", FEAT_DIT}, {"hw.optional.arm.FEAT_DPB", FEAT_DPB}, {"hw.optional.arm.FEAT_DPB2", FEAT_DPB2}, {"hw.optional.arm.FEAT_JSCVT", FEAT_JSCVT}, @@ -53,8 +50,7 @@ void __init_cpu_features_resolver(void) { {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, {"hw.optional.arm.FEAT_SB", FEAT_SB}, {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, - {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, - {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, + {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS}, }; for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); diff --git 
a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc index d8e0280f40416..579aa00dc5c72 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/fuchsia.inc @@ -22,10 +22,6 @@ void __init_cpu_features_resolver() { setCPUFeature(FEAT_SIMD); if (features & ZX_ARM64_FEATURE_ISA_AES) setCPUFeature(FEAT_AES); - if (features & ZX_ARM64_FEATURE_ISA_PMULL) - setCPUFeature(FEAT_PMULL); - if (features & ZX_ARM64_FEATURE_ISA_SHA1) - setCPUFeature(FEAT_SHA1); if (features & ZX_ARM64_FEATURE_ISA_SHA256) setCPUFeature(FEAT_SHA2); if (features & ZX_ARM64_FEATURE_ISA_CRC32) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc index 32a21a2fba9a3..54797852e4aaa 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -16,8 +16,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, hwcap2 = arg->_hwcap2; if (hwcap & HWCAP_CRC32) setCPUFeature(FEAT_CRC); - if (hwcap & HWCAP_PMULL) - setCPUFeature(FEAT_PMULL); if (hwcap & HWCAP_FLAGM) setCPUFeature(FEAT_FLAGM); if (hwcap2 & HWCAP2_FLAGM2) { @@ -34,16 +32,12 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_FP16); setCPUFeature(FEAT_FP); } - if (hwcap & HWCAP_DIT) - setCPUFeature(FEAT_DIT); if (hwcap & HWCAP_ASIMDRDM) setCPUFeature(FEAT_RDM); if (hwcap & HWCAP_ILRCPC) setCPUFeature(FEAT_RCPC2); if (hwcap & HWCAP_AES) setCPUFeature(FEAT_AES); - if (hwcap & HWCAP_SHA1) - setCPUFeature(FEAT_SHA1); if (hwcap & HWCAP_SHA2) setCPUFeature(FEAT_SHA2); if (hwcap & HWCAP_JSCVT) @@ -53,22 +47,11 @@ static void __init_cpu_features_constructor(unsigned long hwcap, if (hwcap & HWCAP_SB) setCPUFeature(FEAT_SB); if (hwcap & HWCAP_SSBS) - setCPUFeature(FEAT_SSBS2); - if (hwcap2 & HWCAP2_MTE) { + setCPUFeature(FEAT_SSBS); + if (hwcap2 & HWCAP2_MTE) setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - } - if (hwcap2 & HWCAP2_MTE3) { - setCPUFeature(FEAT_MEMTAG); - setCPUFeature(FEAT_MEMTAG2); - setCPUFeature(FEAT_MEMTAG3); - } if (hwcap2 & HWCAP2_SVEAES) setCPUFeature(FEAT_SVE_AES); - if (hwcap2 & HWCAP2_SVEPMULL) { - setCPUFeature(FEAT_SVE_AES); - setCPUFeature(FEAT_SVE_PMULL128); - } if (hwcap2 & HWCAP2_SVEBITPERM) setCPUFeature(FEAT_SVE_BITPERM); if (hwcap2 & HWCAP2_SVESHA3) @@ -83,22 +66,12 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_RNG); if (hwcap2 & HWCAP2_I8MM) setCPUFeature(FEAT_I8MM); - if (hwcap2 & HWCAP2_EBF16) - setCPUFeature(FEAT_EBF16); - if (hwcap2 & HWCAP2_SVE_EBF16) - setCPUFeature(FEAT_SVE_EBF16); - if (hwcap2 & HWCAP2_DGH) - setCPUFeature(FEAT_DGH); if (hwcap2 & HWCAP2_FRINT) setCPUFeature(FEAT_FRINTTS); - if (hwcap2 & HWCAP2_SVEI8MM) - setCPUFeature(FEAT_SVE_I8MM); if (hwcap2 & HWCAP2_SVEF32MM) setCPUFeature(FEAT_SVE_F32MM); if (hwcap2 & HWCAP2_SVEF64MM) setCPUFeature(FEAT_SVE_F64MM); - if (hwcap2 & HWCAP2_BTI) - setCPUFeature(FEAT_BTI); if (hwcap2 & HWCAP2_RPRES) setCPUFeature(FEAT_RPRES); if (hwcap2 & HWCAP2_WFXT) @@ -141,9 +114,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, // ID_AA64ZFR0_EL1.SVEver == 0b0001 if (extractBits(ftr, 0, 4) == 0x1) setCPUFeature(FEAT_SVE2); - // ID_AA64ZFR0_EL1.BF16 != 0b0000 - if (extractBits(ftr, 20, 4) != 0x0) - setCPUFeature(FEAT_SVE_BF16); } getCPUFeature(ID_AA64ISAR0_EL1, ftr); // ID_AA64ISAR0_EL1.SHA3 != 0b0000 @@ 
-168,12 +138,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, // ID_AA64ISAR1_EL1.LS64 >= 0b0001 if (extractBits(ftr, 60, 4) >= 0x1) setCPUFeature(FEAT_LS64); - // ID_AA64ISAR1_EL1.LS64 >= 0b0010 - if (extractBits(ftr, 60, 4) >= 0x2) - setCPUFeature(FEAT_LS64_V); - // ID_AA64ISAR1_EL1.LS64 >= 0b0011 - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); } else { // Set some features in case of no CPUID support if (hwcap & (HWCAP_FP | HWCAP_FPHP)) { @@ -187,8 +151,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_RCPC); if (hwcap2 & HWCAP2_BF16 || hwcap2 & HWCAP2_EBF16) setCPUFeature(FEAT_BF16); - if (hwcap2 & HWCAP2_SVEBF16) - setCPUFeature(FEAT_SVE_BF16); if (hwcap2 & HWCAP2_SVE2 && hwcap & HWCAP_SVE) setCPUFeature(FEAT_SVE2); if (hwcap & HWCAP_SHA3) diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt index f2b4ac72ae157..66f2d259aa5fd 100644 --- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt @@ -122,9 +122,6 @@ set(SANITIZER_IMPL_HEADERS sanitizer_asm.h sanitizer_atomic.h sanitizer_atomic_clang.h - sanitizer_atomic_clang_mips.h - sanitizer_atomic_clang_other.h - sanitizer_atomic_clang_x86.h sanitizer_atomic_msvc.h sanitizer_bitvector.h sanitizer_bvgraph.h diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h index 46f06957228c9..0609a11ffdebb 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic.h @@ -18,12 +18,24 @@ namespace __sanitizer { enum memory_order { +// If the __atomic atomic builtins are supported (Clang/GCC), use the +// compiler provided macro values so that we can map the atomic operations +// to __atomic_* directly. 
+#ifdef __ATOMIC_SEQ_CST + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_consume = __ATOMIC_CONSUME, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +#else memory_order_relaxed = 1 << 0, memory_order_consume = 1 << 1, memory_order_acquire = 1 << 2, memory_order_release = 1 << 3, memory_order_acq_rel = 1 << 4, memory_order_seq_cst = 1 << 5 +#endif }; struct atomic_uint8_t { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h index 4318d64d16cfa..1414092e38d7e 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang.h @@ -14,60 +14,63 @@ #ifndef SANITIZER_ATOMIC_CLANG_H #define SANITIZER_ATOMIC_CLANG_H -#if defined(__i386__) || defined(__x86_64__) -# include "sanitizer_atomic_clang_x86.h" -#else -# include "sanitizer_atomic_clang_other.h" -#endif - namespace __sanitizer { -// We would like to just use compiler builtin atomic operations -// for loads and stores, but they are mostly broken in clang: -// - they lead to vastly inefficient code generation -// (http://llvm.org/bugs/show_bug.cgi?id=17281) -// - 64-bit atomic operations are not implemented on x86_32 -// (http://llvm.org/bugs/show_bug.cgi?id=15034) -// - they are not implemented on ARM -// error: undefined reference to '__atomic_load_4' +// We use the compiler builtin atomic operations for loads and stores, which +// generates correct code for all architectures, but may require libatomic +// on platforms where e.g. 64-bit atomics are not supported natively. // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html // for mappings of the memory model to different processors. 
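To make the effect of the enum mapping above concrete, here is a small self-contained sketch (not from the patch; assumes a GCC/Clang-compatible compiler) in which the enumerator values are the __ATOMIC_* constants, so every operation can forward its memory_order argument directly to the builtin with no per-order switch or hand-rolled fences:

```cpp
#include <cstdio>

enum memory_order {
  memory_order_relaxed = __ATOMIC_RELAXED,
  memory_order_acquire = __ATOMIC_ACQUIRE,
  memory_order_release = __ATOMIC_RELEASE,
  memory_order_seq_cst = __ATOMIC_SEQ_CST,
};

struct atomic_uint32_t {
  volatile unsigned val_dont_use;
};

// 'mo' already holds the builtin's ordering constant, so it passes through.
inline unsigned atomic_load(const volatile atomic_uint32_t *a,
                            memory_order mo) {
  return __atomic_load_n(&a->val_dont_use, mo);
}

inline void atomic_store(volatile atomic_uint32_t *a, unsigned v,
                         memory_order mo) {
  __atomic_store_n(&a->val_dont_use, v, mo);
}

int main() {
  atomic_uint32_t x = {0};
  atomic_store(&x, 42, memory_order_release);
  std::printf("%u\n", atomic_load(&x, memory_order_acquire));
  return 0;
}
```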
-inline void atomic_signal_fence(memory_order) { +inline void atomic_signal_fence(memory_order mo) { __atomic_signal_fence(mo); } + +inline void atomic_thread_fence(memory_order mo) { __atomic_thread_fence(mo); } + +inline void proc_yield(int cnt) { + __asm__ __volatile__("" ::: "memory"); +#if defined(__i386__) || defined(__x86_64__) + for (int i = 0; i < cnt; i++) __asm__ __volatile__("pause"); __asm__ __volatile__("" ::: "memory"); +#endif } -inline void atomic_thread_fence(memory_order) { - __sync_synchronize(); +template +inline typename T::Type atomic_load(const volatile T *a, memory_order mo) { + DCHECK(mo == memory_order_relaxed || mo == memory_order_consume || + mo == memory_order_acquire || mo == memory_order_seq_cst); + DCHECK(!((uptr)a % sizeof(*a))); + return __atomic_load_n(&a->val_dont_use, mo); } -template -inline typename T::Type atomic_fetch_add(volatile T *a, - typename T::Type v, memory_order mo) { - (void)mo; +template +inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { + DCHECK(mo == memory_order_relaxed || mo == memory_order_release || + mo == memory_order_seq_cst); DCHECK(!((uptr)a % sizeof(*a))); - return __sync_fetch_and_add(&a->val_dont_use, v); + __atomic_store_n(&a->val_dont_use, v, mo); } -template -inline typename T::Type atomic_fetch_sub(volatile T *a, - typename T::Type v, memory_order mo) { +template +inline typename T::Type atomic_fetch_add(volatile T *a, typename T::Type v, + memory_order mo) { + DCHECK(!((uptr)a % sizeof(*a))); + return __atomic_fetch_add(&a->val_dont_use, v, mo); +} + +template +inline typename T::Type atomic_fetch_sub(volatile T *a, typename T::Type v, + memory_order mo) { (void)mo; DCHECK(!((uptr)a % sizeof(*a))); - return __sync_fetch_and_add(&a->val_dont_use, -v); + return __atomic_fetch_sub(&a->val_dont_use, v, mo); } -template -inline typename T::Type atomic_exchange(volatile T *a, - typename T::Type v, memory_order mo) { +template +inline typename T::Type atomic_exchange(volatile T *a, typename T::Type v, + memory_order mo) { DCHECK(!((uptr)a % sizeof(*a))); - if (mo & (memory_order_release | memory_order_acq_rel | memory_order_seq_cst)) - __sync_synchronize(); - v = __sync_lock_test_and_set(&a->val_dont_use, v); - if (mo == memory_order_seq_cst) - __sync_synchronize(); - return v; + return __atomic_exchange_n(&a->val_dont_use, v, mo); } template @@ -82,9 +85,8 @@ inline bool atomic_compare_exchange_strong(volatile T *a, typename T::Type *cmp, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); } -template -inline bool atomic_compare_exchange_weak(volatile T *a, - typename T::Type *cmp, +template +inline bool atomic_compare_exchange_weak(volatile T *a, typename T::Type *cmp, typename T::Type xchg, memory_order mo) { return atomic_compare_exchange_strong(a, cmp, xchg, mo); @@ -92,13 +94,6 @@ inline bool atomic_compare_exchange_weak(volatile T *a, } // namespace __sanitizer -// This include provides explicit template instantiations for atomic_uint64_t -// on MIPS32, which does not directly support 8 byte atomics. It has to -// proceed the template definitions above. 
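For reference, the strong compare-exchange that survives this rewrite forwards to __atomic_compare_exchange_n, whose failure path stores the value actually observed back through the 'cmp' pointer, which is what makes retry loops natural. A minimal stand-alone sketch (assumes GCC/Clang builtins; not patch code):

```cpp
#include <cassert>

static bool cas_strong(unsigned long *p, unsigned long *cmp,
                       unsigned long xchg) {
  return __atomic_compare_exchange_n(p, cmp, xchg, /*weak=*/false,
                                     __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main() {
  unsigned long v = 5;
  unsigned long expected = 4;
  assert(!cas_strong(&v, &expected, 9)); // mismatch: fails, expected := 5
  assert(expected == 5);
  assert(cas_strong(&v, &expected, 9)); // matches now: succeeds
  assert(v == 9);
  return 0;
}
```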
-#if defined(_MIPS_SIM) && defined(_ABIO32) && _MIPS_SIM == _ABIO32 -# include "sanitizer_atomic_clang_mips.h" -#endif - #undef ATOMIC_ORDER #endif // SANITIZER_ATOMIC_CLANG_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h deleted file mode 100644 index f3d3052e5b7c5..0000000000000 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_mips.h +++ /dev/null @@ -1,117 +0,0 @@ -//===-- sanitizer_atomic_clang_mips.h ---------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer/AddressSanitizer runtime. -// Not intended for direct inclusion. Include sanitizer_atomic.h. -// -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_ATOMIC_CLANG_MIPS_H -#define SANITIZER_ATOMIC_CLANG_MIPS_H - -namespace __sanitizer { - -// MIPS32 does not support atomics > 4 bytes. To address this lack of -// functionality, the sanitizer library provides helper methods which use an -// internal spin lock mechanism to emulate atomic operations when the size is -// 8 bytes. -static void __spin_lock(volatile int *lock) { - while (__sync_lock_test_and_set(lock, 1)) - while (*lock) { - } -} - -static void __spin_unlock(volatile int *lock) { __sync_lock_release(lock); } - -// Make sure the lock is on its own cache line to prevent false sharing. -// Put it inside a struct that is aligned and padded to the typical MIPS -// cacheline which is 32 bytes. 
-static struct { - int lock; - char pad[32 - sizeof(int)]; -} __attribute__((aligned(32))) lock = {0, {0}}; - -template <> -inline atomic_uint64_t::Type atomic_fetch_add(volatile atomic_uint64_t *ptr, - atomic_uint64_t::Type val, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - atomic_uint64_t::Type ret; - - __spin_lock(&lock.lock); - ret = *(const_cast(&ptr->val_dont_use)); - ptr->val_dont_use = ret + val; - __spin_unlock(&lock.lock); - - return ret; -} - -template <> -inline atomic_uint64_t::Type atomic_fetch_sub(volatile atomic_uint64_t *ptr, - atomic_uint64_t::Type val, - memory_order mo) { - return atomic_fetch_add(ptr, -val, mo); -} - -template <> -inline bool atomic_compare_exchange_strong(volatile atomic_uint64_t *ptr, - atomic_uint64_t::Type *cmp, - atomic_uint64_t::Type xchg, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - typedef atomic_uint64_t::Type Type; - Type cmpv = *cmp; - Type prev; - bool ret = false; - - __spin_lock(&lock.lock); - prev = *(const_cast(&ptr->val_dont_use)); - if (prev == cmpv) { - ret = true; - ptr->val_dont_use = xchg; - } - __spin_unlock(&lock.lock); - - return ret; -} - -template <> -inline atomic_uint64_t::Type atomic_load(const volatile atomic_uint64_t *ptr, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - atomic_uint64_t::Type zero = 0; - volatile atomic_uint64_t *Newptr = - const_cast(ptr); - return atomic_fetch_add(Newptr, zero, mo); -} - -template <> -inline void atomic_store(volatile atomic_uint64_t *ptr, atomic_uint64_t::Type v, - memory_order mo) { - DCHECK(mo & - (memory_order_relaxed | memory_order_release | memory_order_seq_cst)); - DCHECK(!((uptr)ptr % sizeof(*ptr))); - - __spin_lock(&lock.lock); - ptr->val_dont_use = v; - __spin_unlock(&lock.lock); -} - -} // namespace __sanitizer - -#endif // SANITIZER_ATOMIC_CLANG_MIPS_H - diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h deleted file mode 100644 index 557082a636b87..0000000000000 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_other.h +++ /dev/null @@ -1,85 +0,0 @@ -//===-- sanitizer_atomic_clang_other.h --------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer/AddressSanitizer runtime. -// Not intended for direct inclusion. Include sanitizer_atomic.h. 
-// -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_ATOMIC_CLANG_OTHER_H -#define SANITIZER_ATOMIC_CLANG_OTHER_H - -namespace __sanitizer { - - -inline void proc_yield(int cnt) { - __asm__ __volatile__("" ::: "memory"); -} - -template -inline typename T::Type atomic_load( - const volatile T *a, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_consume - | memory_order_acquire | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - typename T::Type v; - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned loads are atomic. - if (mo == memory_order_relaxed) { - v = a->val_dont_use; - } else if (mo == memory_order_consume) { - // Assume that processor respects data dependencies - // (and that compiler won't break them). - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __asm__ __volatile__("" ::: "memory"); - } else if (mo == memory_order_acquire) { - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __sync_synchronize(); - } else { // seq_cst - // E.g. on POWER we need a hw fence even before the store. - __sync_synchronize(); - v = a->val_dont_use; - __sync_synchronize(); - } - } else { - __atomic_load(const_cast(&a->val_dont_use), &v, - __ATOMIC_SEQ_CST); - } - return v; -} - -template -inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_release - | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned stores are atomic. - if (mo == memory_order_relaxed) { - a->val_dont_use = v; - } else if (mo == memory_order_release) { - __sync_synchronize(); - a->val_dont_use = v; - __asm__ __volatile__("" ::: "memory"); - } else { // seq_cst - __sync_synchronize(); - a->val_dont_use = v; - __sync_synchronize(); - } - } else { - __atomic_store(&a->val_dont_use, &v, __ATOMIC_SEQ_CST); - } -} - -} // namespace __sanitizer - -#endif // #ifndef SANITIZER_ATOMIC_CLANG_OTHER_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h deleted file mode 100644 index b81a354d20987..0000000000000 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_clang_x86.h +++ /dev/null @@ -1,113 +0,0 @@ -//===-- sanitizer_atomic_clang_x86.h ----------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of ThreadSanitizer/AddressSanitizer runtime. -// Not intended for direct inclusion. Include sanitizer_atomic.h. 
-// -//===----------------------------------------------------------------------===// - -#ifndef SANITIZER_ATOMIC_CLANG_X86_H -#define SANITIZER_ATOMIC_CLANG_X86_H - -namespace __sanitizer { - -inline void proc_yield(int cnt) { - __asm__ __volatile__("" ::: "memory"); - for (int i = 0; i < cnt; i++) - __asm__ __volatile__("pause"); - __asm__ __volatile__("" ::: "memory"); -} - -template -inline typename T::Type atomic_load( - const volatile T *a, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_consume - | memory_order_acquire | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - typename T::Type v; - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned loads are atomic. - if (mo == memory_order_relaxed) { - v = a->val_dont_use; - } else if (mo == memory_order_consume) { - // Assume that processor respects data dependencies - // (and that compiler won't break them). - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __asm__ __volatile__("" ::: "memory"); - } else if (mo == memory_order_acquire) { - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - // On x86 loads are implicitly acquire. - __asm__ __volatile__("" ::: "memory"); - } else { // seq_cst - // On x86 plain MOV is enough for seq_cst store. - __asm__ __volatile__("" ::: "memory"); - v = a->val_dont_use; - __asm__ __volatile__("" ::: "memory"); - } - } else { - // 64-bit load on 32-bit platform. - __asm__ __volatile__( - "movq %1, %%mm0;" // Use mmx reg for 64-bit atomic moves - "movq %%mm0, %0;" // (ptr could be read-only) - "emms;" // Empty mmx state/Reset FP regs - : "=m" (v) - : "m" (a->val_dont_use) - : // mark the mmx registers as clobbered -#ifdef __MMX__ - "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", -#endif // #ifdef __MMX__ - "memory"); - } - return v; -} - -template -inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_release - | memory_order_seq_cst)); - DCHECK(!((uptr)a % sizeof(*a))); - - if (sizeof(*a) < 8 || sizeof(void*) == 8) { - // Assume that aligned stores are atomic. - if (mo == memory_order_relaxed) { - a->val_dont_use = v; - } else if (mo == memory_order_release) { - // On x86 stores are implicitly release. - __asm__ __volatile__("" ::: "memory"); - a->val_dont_use = v; - __asm__ __volatile__("" ::: "memory"); - } else { // seq_cst - // On x86 stores are implicitly release. - __asm__ __volatile__("" ::: "memory"); - a->val_dont_use = v; - __sync_synchronize(); - } - } else { - // 64-bit store on 32-bit platform. 
- __asm__ __volatile__( - "movq %1, %%mm0;" // Use mmx reg for 64-bit atomic moves - "movq %%mm0, %0;" - "emms;" // Empty mmx state/Reset FP regs - : "=m" (a->val_dont_use) - : "m" (v) - : // mark the mmx registers as clobbered -#ifdef __MMX__ - "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7", -#endif // #ifdef __MMX__ - "memory"); - if (mo == memory_order_seq_cst) - __sync_synchronize(); - } -} - -} // namespace __sanitizer - -#endif // #ifndef SANITIZER_ATOMIC_CLANG_X86_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h index 31317adcdfc99..d80bfdbf6a081 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_atomic_msvc.h @@ -70,8 +70,8 @@ inline void proc_yield(int cnt) { template inline typename T::Type atomic_load( const volatile T *a, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_consume - | memory_order_acquire | memory_order_seq_cst)); + DCHECK(mo == memory_order_relaxed || mo == memory_order_consume || + mo == memory_order_acquire || mo == memory_order_seq_cst); DCHECK(!((uptr)a % sizeof(*a))); typename T::Type v; // FIXME(dvyukov): 64-bit load is not atomic on 32-bits. @@ -87,8 +87,8 @@ inline typename T::Type atomic_load( template inline void atomic_store(volatile T *a, typename T::Type v, memory_order mo) { - DCHECK(mo & (memory_order_relaxed | memory_order_release - | memory_order_seq_cst)); + DCHECK(mo == memory_order_relaxed || mo == memory_order_release || + mo == memory_order_seq_cst); DCHECK(!((uptr)a % sizeof(*a))); // FIXME(dvyukov): 64-bit store is not atomic on 32-bits. if (mo == memory_order_relaxed) { diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c index b14ac7bcf1924..02220cb78e6c9 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/mmap_56bit_test.c @@ -1,6 +1,9 @@ -// RUN: %clangxx %s -pie -fPIE -o %t && %run %t +// RUN: %clang %s -pie -fPIE -o %t && %run %t // REQUIRES: x86_64-target-arch +// FIXME: Fails Asan, as expected, with 5lvl page tables. +// UNSUPPORTED: x86_64-target-arch + #include #include #include diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 6fb6213e90fc4..e34d3851187ac 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -10,7 +10,7 @@ set(FLANG_EXPERIMENTAL_OMP_OFFLOAD_BUILD "off" CACHE STRING set(FLANG_OMP_DEVICE_ARCHITECTURES "all" CACHE STRING "List of OpenMP device architectures to be used to compile the Fortran runtime (e.g. 'gfx1103;sm_90')") -macro(enable_cuda_compilation files) +macro(enable_cuda_compilation name files) if (FLANG_EXPERIMENTAL_CUDA_RUNTIME) if (BUILD_SHARED_LIBS) message(FATAL_ERROR @@ -52,6 +52,10 @@ macro(enable_cuda_compilation files) include_directories(AFTER ${FLANG_LIBCUDACXX_PATH}/include) add_compile_definitions(RT_USE_LIBCUDACXX=1) endif() + + # Add an OBJECT library consisting of CUDA PTX. 
+ llvm_add_library(${name}PTX OBJECT PARTIAL_SOURCES_INTENDED ${files}) + set_property(TARGET obj.${name}PTX PROPERTY CUDA_PTX_COMPILATION ON) endif() endmacro() diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index ccb93e104dab6..848619cb65d90 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -657,6 +657,14 @@ CALL CO_REDUCE CALL CO_SUM ``` +### Inquiry Functions +ACCESS (GNU extension) is not supported on Windows. Otherwise: +``` +CHARACTER(LEN=*) :: path = 'path/to/file' +IF (ACCESS(path, 'rwx')) & + ... +``` + ## Non-standard intrinsics ### PGI ``` diff --git a/flang/include/flang/Lower/CallInterface.h b/flang/include/flang/Lower/CallInterface.h index 80b0576425377..a11e81b6593de 100644 --- a/flang/include/flang/Lower/CallInterface.h +++ b/flang/include/flang/Lower/CallInterface.h @@ -391,9 +391,6 @@ class CallerInterface : public CallInterface { llvm_unreachable("getting host associated type in CallerInterface"); } - /// Set attributes on MLIR function. - void setFuncAttrs(mlir::func::FuncOp) const {} - private: /// Check that the input vector is complete. bool verifyActualInputs() const; @@ -444,7 +441,6 @@ class CalleeInterface : public CallInterface { bool hasHostAssociated() const; mlir::Type getHostAssociatedTy() const; mlir::Value getHostAssociatedTuple() const; - void setFuncAttrs(mlir::func::FuncOp) const; private: Fortran::lower::pft::FunctionLikeUnit &funit; diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index dff1cdb20cbfe..92790a691e473 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -3190,4 +3190,61 @@ def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> { }]; } +def fir_CUDAAllocateOp : fir_Op<"cuda_allocate", [AttrSizedOperandSegments, + MemoryEffects<[MemAlloc]>]> { + let summary = "Perform the device allocation of data of an allocatable"; + + let description = [{ + The fir.cuda_allocate operation performs the allocation on the device + of the data of an allocatable. The descriptor passed to the operation + is initialized before with the standard flang runtime calls. + }]; + + let arguments = (ins Arg:$box, + Arg, "", [MemWrite]>:$errmsg, + Optional:$stream, + Arg, "", [MemWrite]>:$pinned, + Arg, "", [MemRead]>:$source, + fir_CUDADataAttributeAttr:$cuda_attr, + UnitAttr:$hasStat); + + let results = (outs AnyIntegerType:$stat); + + let assemblyFormat = [{ + $box `:` qualified(type($box)) + ( `source` `(` $source^ `:` qualified(type($source) )`)` )? + ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )? + ( `stream` `(` $stream^ `:` type($stream) `)` )? + ( `pinned` `(` $pinned^ `:` type($pinned) `)` )? + attr-dict `->` type($stat) + }]; + + let hasVerifier = 1; +} + +def fir_CUDADeallocateOp : fir_Op<"cuda_deallocate", + [MemoryEffects<[MemFree]>]> { + let summary = "Perform the device deallocation of data of an allocatable"; + + let description = [{ + The fir.cuda_deallocate operation performs the deallocation on the device + of the data of an allocatable. + }]; + + let arguments = (ins Arg:$box, + Arg, "", [MemWrite]>:$errmsg, + fir_CUDADataAttributeAttr:$cuda_attr, + UnitAttr:$hasStat); + + let results = (outs AnyIntegerType:$stat); + + let assemblyFormat = [{ + $box `:` qualified(type($box)) + ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )? 
+ attr-dict `->` type($stat) + }]; + + let hasVerifier = 1; +} + #endif diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h index 3266ea3aa7fdc..46b62d8de8d37 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h +++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h @@ -104,9 +104,9 @@ static constexpr llvm::StringRef getHostAssocAttrName() { return "fir.host_assoc"; } -/// Attribute to mark an internal procedure. -static constexpr llvm::StringRef getInternalProcedureAttrName() { - return "fir.internal_proc"; +/// Attribute to link an internal procedure to its host procedure symbol. +static constexpr llvm::StringRef getHostSymbolAttrName() { + return "fir.host_symbol"; } /// Attribute containing the original name of a function from before the @@ -122,8 +122,8 @@ bool hasHostAssociationArgument(mlir::func::FuncOp func); /// Is the function, \p func an internal procedure ? /// Some internal procedures may have access to saved host procedure /// variables even when they do not have a tuple argument. -inline bool isInternalPorcedure(mlir::func::FuncOp func) { - return func->hasAttr(fir::getInternalProcedureAttrName()); +inline bool isInternalProcedure(mlir::func::FuncOp func) { + return func->hasAttr(fir::getHostSymbolAttrName()); } /// Tell if \p value is: diff --git a/flang/include/flang/Optimizer/Dialect/FIRTypes.td b/flang/include/flang/Optimizer/Dialect/FIRTypes.td index 4c6a8064991ab..3b876e4642da9 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRTypes.td +++ b/flang/include/flang/Optimizer/Dialect/FIRTypes.td @@ -625,6 +625,7 @@ def AnyRefOrBoxLike : TypeConstraint, "any reference or box">; +def AnyRefOrBoxType : Type; def AnyShapeLike : TypeConstraint, "any legal shape type">; diff --git a/flang/include/flang/Runtime/extensions.h b/flang/include/flang/Runtime/extensions.h index 7d0952206fc19..fef651f3b2eed 100644 --- a/flang/include/flang/Runtime/extensions.h +++ b/flang/include/flang/Runtime/extensions.h @@ -44,5 +44,12 @@ std::int64_t RTNAME(Signal)(std::int64_t number, void (*handler)(int)); // GNU extension subroutine SLEEP(SECONDS) void RTNAME(Sleep)(std::int64_t seconds); +// GNU extension function ACCESS(NAME, MODE) +// TODO: not supported on Windows +#ifndef _WIN32 +std::int64_t FORTRAN_PROCEDURE_NAME(access)(const char *name, + std::int64_t nameLength, const char *mode, std::int64_t modeLength); +#endif + } // extern "C" #endif // FORTRAN_RUNTIME_EXTENSIONS_H_ diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h index 91773ae3ea9a3..842d251b682aa 100644 --- a/flang/include/flang/Semantics/openmp-directive-sets.h +++ b/flang/include/flang/Semantics/openmp-directive-sets.h @@ -32,14 +32,14 @@ static const OmpDirectiveSet topDistributeSet{ static const OmpDirectiveSet allDistributeSet{ OmpDirectiveSet{ - llvm::omp::OMPD_target_teams_distribute, - llvm::omp::OMPD_target_teams_distribute_parallel_do, - llvm::omp::OMPD_target_teams_distribute_parallel_do_simd, - llvm::omp::OMPD_target_teams_distribute_simd, - llvm::omp::OMPD_teams_distribute, - llvm::omp::OMPD_teams_distribute_parallel_do, - llvm::omp::OMPD_teams_distribute_parallel_do_simd, - llvm::omp::OMPD_teams_distribute_simd, + Directive::OMPD_target_teams_distribute, + Directive::OMPD_target_teams_distribute_parallel_do, + Directive::OMPD_target_teams_distribute_parallel_do_simd, + Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_teams_distribute, + 
Directive::OMPD_teams_distribute_parallel_do, + Directive::OMPD_teams_distribute_parallel_do_simd, + Directive::OMPD_teams_distribute_simd, } | topDistributeSet, }; @@ -63,10 +63,24 @@ static const OmpDirectiveSet allDoSet{ } | topDoSet, }; +static const OmpDirectiveSet topLoopSet{ + Directive::OMPD_loop, +}; + +static const OmpDirectiveSet allLoopSet{ + OmpDirectiveSet{ + Directive::OMPD_parallel_loop, + Directive::OMPD_target_parallel_loop, + Directive::OMPD_target_teams_loop, + Directive::OMPD_teams_loop, + } | topLoopSet, +}; + static const OmpDirectiveSet topParallelSet{ Directive::OMPD_parallel, Directive::OMPD_parallel_do, Directive::OMPD_parallel_do_simd, + Directive::OMPD_parallel_loop, Directive::OMPD_parallel_masked_taskloop, Directive::OMPD_parallel_masked_taskloop_simd, Directive::OMPD_parallel_master_taskloop, @@ -82,6 +96,7 @@ static const OmpDirectiveSet allParallelSet{ Directive::OMPD_target_parallel, Directive::OMPD_target_parallel_do, Directive::OMPD_target_parallel_do_simd, + Directive::OMPD_target_parallel_loop, Directive::OMPD_target_teams_distribute_parallel_do, Directive::OMPD_target_teams_distribute_parallel_do_simd, Directive::OMPD_teams_distribute_parallel_do, @@ -118,12 +133,14 @@ static const OmpDirectiveSet topTargetSet{ Directive::OMPD_target_parallel, Directive::OMPD_target_parallel_do, Directive::OMPD_target_parallel_do_simd, + Directive::OMPD_target_parallel_loop, Directive::OMPD_target_simd, Directive::OMPD_target_teams, Directive::OMPD_target_teams_distribute, Directive::OMPD_target_teams_distribute_parallel_do, Directive::OMPD_target_teams_distribute_parallel_do_simd, Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams_loop, }; static const OmpDirectiveSet allTargetSet{topTargetSet}; @@ -156,11 +173,12 @@ static const OmpDirectiveSet topTeamsSet{ static const OmpDirectiveSet allTeamsSet{ OmpDirectiveSet{ - llvm::omp::OMPD_target_teams, - llvm::omp::OMPD_target_teams_distribute, - llvm::omp::OMPD_target_teams_distribute_parallel_do, - llvm::omp::OMPD_target_teams_distribute_parallel_do_simd, - llvm::omp::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams, + Directive::OMPD_target_teams_distribute, + Directive::OMPD_target_teams_distribute_parallel_do, + Directive::OMPD_target_teams_distribute_parallel_do_simd, + Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams_loop, } | topTeamsSet, }; @@ -178,6 +196,14 @@ static const OmpDirectiveSet allDistributeSimdSet{ static const OmpDirectiveSet allDoSimdSet{allDoSet & allSimdSet}; static const OmpDirectiveSet allTaskloopSimdSet{allTaskloopSet & allSimdSet}; +static const OmpDirectiveSet compositeConstructSet{ + Directive::OMPD_distribute_parallel_do, + Directive::OMPD_distribute_parallel_do_simd, + Directive::OMPD_distribute_simd, + Directive::OMPD_do_simd, + Directive::OMPD_taskloop_simd, +}; + static const OmpDirectiveSet blockConstructSet{ Directive::OMPD_master, Directive::OMPD_ordered, @@ -201,12 +227,14 @@ static const OmpDirectiveSet loopConstructSet{ Directive::OMPD_distribute_simd, Directive::OMPD_do, Directive::OMPD_do_simd, + Directive::OMPD_loop, Directive::OMPD_masked_taskloop, Directive::OMPD_masked_taskloop_simd, Directive::OMPD_master_taskloop, Directive::OMPD_master_taskloop_simd, Directive::OMPD_parallel_do, Directive::OMPD_parallel_do_simd, + Directive::OMPD_parallel_loop, Directive::OMPD_parallel_masked_taskloop, Directive::OMPD_parallel_masked_taskloop_simd, Directive::OMPD_parallel_master_taskloop, @@ -214,17 +242,20 @@ 
static const OmpDirectiveSet loopConstructSet{ Directive::OMPD_simd, Directive::OMPD_target_parallel_do, Directive::OMPD_target_parallel_do_simd, + Directive::OMPD_target_parallel_loop, Directive::OMPD_target_simd, Directive::OMPD_target_teams_distribute, Directive::OMPD_target_teams_distribute_parallel_do, Directive::OMPD_target_teams_distribute_parallel_do_simd, Directive::OMPD_target_teams_distribute_simd, + Directive::OMPD_target_teams_loop, Directive::OMPD_taskloop, Directive::OMPD_taskloop_simd, Directive::OMPD_teams_distribute, Directive::OMPD_teams_distribute_parallel_do, Directive::OMPD_teams_distribute_parallel_do_simd, Directive::OMPD_teams_distribute_simd, + Directive::OMPD_teams_loop, Directive::OMPD_tile, Directive::OMPD_unroll, }; diff --git a/flang/lib/Decimal/CMakeLists.txt b/flang/lib/Decimal/CMakeLists.txt index 3d562b8e3ce1e..880b190f1c581 100644 --- a/flang/lib/Decimal/CMakeLists.txt +++ b/flang/lib/Decimal/CMakeLists.txt @@ -55,7 +55,7 @@ set(sources ) include(AddFlangOffloadRuntime) -enable_cuda_compilation("${sources}") +enable_cuda_compilation(FortranDecimal "${sources}") enable_omp_offload_compilation("${sources}") add_flang_library(FortranDecimal INSTALL_WITH_TOOLCHAIN ${sources}) diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp index 42e78fc96e444..1d434d512d0c5 100644 --- a/flang/lib/Lower/Allocatable.cpp +++ b/flang/lib/Lower/Allocatable.cpp @@ -14,6 +14,7 @@ #include "flang/Evaluate/tools.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/ConvertType.h" +#include "flang/Lower/ConvertVariable.h" #include "flang/Lower/IterationSpace.h" #include "flang/Lower/Mangler.h" #include "flang/Lower/OpenACC.h" @@ -368,20 +369,17 @@ class AllocateStmtHelper { [&](const Fortran::parser::AllocOpt::Mold &mold) { moldExpr = Fortran::semantics::GetExpr(mold.v.value()); }, - [&](const Fortran::parser::AllocOpt::Stream &) { - TODO(loc, "CUDA ALLOCATE(STREAM=)"); + [&](const Fortran::parser::AllocOpt::Stream &stream) { + streamExpr = Fortran::semantics::GetExpr(stream.v.value()); }, - [&](const Fortran::parser::AllocOpt::Pinned &) { - TODO(loc, "CUDA ALLOCATE(PINNED=)"); + [&](const Fortran::parser::AllocOpt::Pinned &pinned) { + pinnedExpr = Fortran::semantics::GetExpr(pinned.v.value()); }, }, allocOption.u); } void lowerAllocation(const Allocation &alloc) { - if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) - TODO(loc, "Allocation of variable with CUDA attributes"); - fir::MutableBoxValue boxAddr = genMutableBoxValue(converter, loc, alloc.getAllocObj()); @@ -456,7 +454,8 @@ class AllocateStmtHelper { const fir::MutableBoxValue &box) { if (!box.isDerived() && !errorManager.hasStatSpec() && !alloc.type.IsPolymorphic() && !alloc.hasCoarraySpec() && - !useAllocateRuntime && !box.isPointer()) { + !useAllocateRuntime && !box.isPointer() && + !Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) { // Pointers must use PointerAllocate so that their deallocations // can be validated. 
genInlinedAllocation(alloc, box); @@ -472,7 +471,12 @@ class AllocateStmtHelper { genSetType(alloc, box, loc); genSetDeferredLengthParameters(alloc, box); genAllocateObjectBounds(alloc, box); - mlir::Value stat = genRuntimeAllocate(builder, loc, box, errorManager); + mlir::Value stat; + if (!Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) + stat = genRuntimeAllocate(builder, loc, box, errorManager); + else + stat = + genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol()); fir::factory::syncMutableBoxFromIRBox(builder, loc, box); postAllocationAction(alloc); errorManager.assignStat(builder, loc, stat); @@ -602,7 +606,10 @@ class AllocateStmtHelper { genSetDeferredLengthParameters(alloc, box); genAllocateObjectBounds(alloc, box); mlir::Value stat; - if (isSource) + if (Fortran::semantics::HasCUDAAttr(alloc.getSymbol())) + stat = + genCudaAllocate(builder, loc, box, errorManager, alloc.getSymbol()); + else if (isSource) stat = genRuntimeAllocateSource(builder, loc, box, exv, errorManager); else stat = genRuntimeAllocate(builder, loc, box, errorManager); @@ -717,6 +724,34 @@ class AllocateStmtHelper { return nullptr; } + mlir::Value genCudaAllocate(fir::FirOpBuilder &builder, mlir::Location loc, + const fir::MutableBoxValue &box, + ErrorManager &errorManager, + const Fortran::semantics::Symbol &sym) { + Fortran::lower::StatementContext stmtCtx; + fir::CUDADataAttributeAttr cudaAttr = + Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(), + sym); + mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr; + mlir::Value stream = + streamExpr + ? fir::getBase(converter.genExprValue(loc, *streamExpr, stmtCtx)) + : nullptr; + mlir::Value pinned = + pinnedExpr + ? fir::getBase(converter.genExprAddr(loc, *pinnedExpr, stmtCtx)) + : nullptr; + mlir::Value source = sourceExpr ? fir::getBase(sourceExv) : nullptr; + + // Keep return type the same as a standard AllocatableAllocate call. + mlir::Type retTy = fir::runtime::getModel()(builder.getContext()); + return builder + .create( + loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr, + errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr) + .getResult(); + } + Fortran::lower::AbstractConverter &converter; fir::FirOpBuilder &builder; const Fortran::parser::AllocateStmt &stmt; @@ -724,6 +759,8 @@ class AllocateStmtHelper { const Fortran::lower::SomeExpr *moldExpr{nullptr}; const Fortran::lower::SomeExpr *statExpr{nullptr}; const Fortran::lower::SomeExpr *errMsgExpr{nullptr}; + const Fortran::lower::SomeExpr *pinnedExpr{nullptr}; + const Fortran::lower::SomeExpr *streamExpr{nullptr}; // If the allocate has a type spec, lenParams contains the // value of the length parameters that were specified inside. llvm::SmallVector lenParams; diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp index 05a0c10c70974..2d4d17a2ef12e 100644 --- a/flang/lib/Lower/CallInterface.cpp +++ b/flang/lib/Lower/CallInterface.cpp @@ -575,13 +575,6 @@ mlir::Value Fortran::lower::CalleeInterface::getHostAssociatedTuple() const { return converter.hostAssocTupleValue(); } -void Fortran::lower::CalleeInterface::setFuncAttrs( - mlir::func::FuncOp func) const { - if (funit.parentHasHostAssoc()) - func->setAttr(fir::getInternalProcedureAttrName(), - mlir::UnitAttr::get(func->getContext())); -} - //===----------------------------------------------------------------------===// // CallInterface implementation: this part is common to both caller and callee. 
//===----------------------------------------------------------------------===// @@ -589,6 +582,34 @@ void Fortran::lower::CalleeInterface::setFuncAttrs( static void addSymbolAttribute(mlir::func::FuncOp func, const Fortran::semantics::Symbol &sym, mlir::MLIRContext &mlirContext) { + const Fortran::semantics::Symbol &ultimate = sym.GetUltimate(); + // The link between an internal procedure and its host procedure is lost + // in FIR if the host is BIND(C) since the internal mangling will not + // allow retrieving the host bind(C) name, and therefore func.func symbol. + // Preserve it as an attribute so that this can be later retrieved. + if (Fortran::semantics::ClassifyProcedure(ultimate) == + Fortran::semantics::ProcedureDefinitionClass::Internal) { + if (ultimate.owner().kind() == + Fortran::semantics::Scope::Kind::Subprogram) { + if (const Fortran::semantics::Symbol *hostProcedure = + ultimate.owner().symbol()) { + std::string hostName = Fortran::lower::mangle::mangleName( + *hostProcedure, /*keepExternalInScope=*/true); + func->setAttr( + fir::getHostSymbolAttrName(), + mlir::SymbolRefAttr::get( + &mlirContext, mlir::StringAttr::get(&mlirContext, hostName))); + } + } else if (ultimate.owner().kind() == + Fortran::semantics::Scope::Kind::MainProgram) { + func->setAttr(fir::getHostSymbolAttrName(), + mlir::SymbolRefAttr::get( + &mlirContext, + mlir::StringAttr::get( + &mlirContext, fir::NameUniquer::doProgramEntry()))); + } + } + // Only add this on bind(C) functions for which the symbol is not reflected in // the current context. if (!Fortran::semantics::IsBindCProcedure(sym)) @@ -686,7 +707,6 @@ void Fortran::lower::CallInterface::declare() { for (const auto &placeHolder : llvm::enumerate(inputs)) if (!placeHolder.value().attributes.empty()) func.setArgAttrs(placeHolder.index(), placeHolder.value().attributes); - side().setFuncAttrs(func); setCUDAAttributes(func, side().getProcedureSymbol(), characteristic); } @@ -1599,10 +1619,6 @@ class SignatureBuilder return proc; } - /// Set internal procedure attribute on MLIR function. Internal procedure - /// are defined in the current file and will not go through SignatureBuilder. - void setFuncAttrs(mlir::func::FuncOp) const {} - /// This is not the description of an indirect call. static constexpr bool isIndirectCall() { return false; } diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index ae0d8bd37228d..4c51b61f6bf02 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -832,8 +832,8 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, } bool ClauseProcessor::processMap( - mlir::Location currentLocation, const llvm::omp::Directive &directive, - Fortran::lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result, + mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx, + mlir::omp::MapClauseOps &result, llvm::SmallVectorImpl *mapSyms, llvm::SmallVectorImpl *mapSymLocs, llvm::SmallVectorImpl *mapSymTypes) const { diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index aa2c14b61e756..3f9701310ebae 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -114,8 +114,7 @@ class ClauseProcessor { // They may be used later on to create the block_arguments for some of the // target directives that require it. 
  bool processMap(
-      mlir::Location currentLocation, const llvm::omp::Directive &directive,
-      Fortran::lower::StatementContext &stmtCtx,
+      mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx,
       mlir::omp::MapClauseOps &result,
       llvm::SmallVectorImpl *mapSyms = nullptr,
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 3dcfe0fd775dc..bb38082b245ef 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -103,21 +103,6 @@ static fir::GlobalOp globalInitialization(
   return global;
 }
 
-static mlir::Operation *getCompareFromReductionOp(mlir::Operation *reductionOp,
-                                                  mlir::Value loadVal) {
-  for (mlir::Value reductionOperand : reductionOp->getOperands()) {
-    if (mlir::Operation *compareOp = reductionOperand.getDefiningOp()) {
-      if (compareOp->getOperand(0) == loadVal ||
-          compareOp->getOperand(1) == loadVal)
-        assert((mlir::isa(compareOp) ||
-                mlir::isa(compareOp)) &&
-               "Expected comparison not found in reduction intrinsic");
-      return compareOp;
-    }
-  }
-  return nullptr;
-}
-
 // Get the extended value for \p val by extracting additional variable
 // information from \p base.
 static fir::ExtendedValue getExtendedValue(fir::ExtendedValue base,
@@ -237,291 +222,432 @@ createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter,
   return storeOp;
 }
 
-static mlir::Operation *
-findReductionChain(mlir::Value loadVal, mlir::Value *reductionVal = nullptr) {
-  for (mlir::OpOperand &loadOperand : loadVal.getUses()) {
-    if (mlir::Operation *reductionOp = loadOperand.getOwner()) {
-      if (auto convertOp = mlir::dyn_cast(reductionOp)) {
-        for (mlir::OpOperand &convertOperand : convertOp.getRes().getUses()) {
-          if (mlir::Operation *reductionOp = convertOperand.getOwner())
-            return reductionOp;
-        }
-      }
-      for (mlir::OpOperand &reductionOperand : reductionOp->getUses()) {
-        if (auto store =
-                mlir::dyn_cast(reductionOperand.getOwner())) {
-          if (store.getMemref() == *reductionVal) {
-            store.erase();
-            return reductionOp;
-          }
-        }
-        if (auto assign =
-                mlir::dyn_cast(reductionOperand.getOwner())) {
-          if (assign.getLhs() == *reductionVal) {
-            assign.erase();
-            return reductionOp;
-          }
-        }
-      }
-    }
-  }
-  return nullptr;
-}
+// This helper function implements the functionality of "promoting"
+// non-CPTR arguments of use_device_ptr to use_device_addr
+// arguments (automagic conversion of use_device_ptr ->
+// use_device_addr in these cases). The way we do so currently is
+// through the shuffling of operands from the devicePtrOperands to
+// deviceAddrOperands where necessary and re-organizing the types,
+// locations and symbols to maintain the correct ordering of ptr/addr
+// input -> BlockArg.
+//
+// This effectively implements some deprecated OpenMP functionality
+// that some legacy applications unfortunately depend on
+// (deprecated in specification version 5.2):
+//
+// "If a list item in a use_device_ptr clause is not of type C_PTR,
+// the behavior is as if the list item appeared in a use_device_addr
+// clause. Support for such list items in a use_device_ptr clause
+// is deprecated."
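The shuffle this comment describes is easiest to see in isolation. Below is a minimal, self-contained sketch of the same promotion logic, using plain std::vector<int> stand-ins for the operand and metadata lists; the names and the isCPtrLike predicate are hypothetical and only illustrate the shape of the real MLIR-based code that follows.

#include <cstddef>
#include <iterator>
#include <vector>

// Stand-in for fir::isa_builtin_cptr_type: even values play the role of
// C_PTR-typed operands, odd values must be "promoted".
static bool isCPtrLike(int v) { return v % 2 == 0; }

// Move every non-C_PTR entry from ptrVars to the back of addrVars, rotating
// the matching entry of the parallel metadata vector to the back as well so
// that the input -> BlockArg ordering stays consistent.
static void promote(std::vector<int> &ptrVars, std::vector<int> &addrVars,
                    std::vector<int> &metadata) {
  auto moveElementToBack = [](std::size_t idx, auto &vec) {
    auto iter = std::next(vec.begin(), idx);
    vec.push_back(*iter);
    vec.erase(iter);
  };
  for (auto it = ptrVars.begin(); it != ptrVars.end();) {
    if (!isCPtrLike(*it)) {
      addrVars.push_back(*it);
      moveElementToBack(std::distance(ptrVars.begin(), it), metadata);
      it = ptrVars.erase(it);
      continue;
    }
    ++it;
  }
}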
+static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(
+    mlir::omp::UseDeviceClauseOps &clauseOps,
+    llvm::SmallVectorImpl &useDeviceTypes,
+    llvm::SmallVectorImpl &useDeviceLocs,
+    llvm::SmallVectorImpl
+        &useDeviceSymbols) {
+  auto moveElementToBack = [](size_t idx, auto &vector) {
+    auto *iter = std::next(vector.begin(), idx);
+    vector.push_back(*iter);
+    vector.erase(iter);
+  };
 
-// for a logical operator 'op' reduction X = X op Y
-// This function returns the operation responsible for converting Y from
-// fir.logical<4> to i1
-static fir::ConvertOp getConvertFromReductionOp(mlir::Operation *reductionOp,
-                                                mlir::Value loadVal) {
-  for (mlir::Value reductionOperand : reductionOp->getOperands()) {
-    if (auto convertOp =
-            mlir::dyn_cast(reductionOperand.getDefiningOp())) {
-      if (convertOp.getOperand() == loadVal)
-        continue;
-      return convertOp;
+  // Iterate over our use_device_ptr list and shift all non-cptr arguments into
+  // use_device_addr.
+  for (auto *it = clauseOps.useDevicePtrVars.begin();
+       it != clauseOps.useDevicePtrVars.end();) {
+    if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) {
+      clauseOps.useDeviceAddrVars.push_back(*it);
+      // We have to shuffle the symbols around as well, to maintain
+      // the correct Input -> BlockArg for use_device_ptr/use_device_addr.
+      // NOTE: As maps do not seem to be included here currently, this is
+      // less pertinent, but the ordering must still be maintained for
+      // future alterations. The likely reason maps are not included yet is
+      // that the BlockArg assign/lowering needs to be extended to a
+      // greater set of types.
+      auto idx = std::distance(clauseOps.useDevicePtrVars.begin(), it);
+      moveElementToBack(idx, useDeviceTypes);
+      moveElementToBack(idx, useDeviceLocs);
+      moveElementToBack(idx, useDeviceSymbols);
+      it = clauseOps.useDevicePtrVars.erase(it);
+      continue;
    }
+    ++it;
  }
-  return nullptr;
 }
 
-static void updateReduction(mlir::Operation *op,
-                            fir::FirOpBuilder &firOpBuilder,
-                            mlir::Value loadVal, mlir::Value reductionVal,
-                            fir::ConvertOp *convertOp = nullptr) {
-  mlir::OpBuilder::InsertPoint insertPtDel = firOpBuilder.saveInsertionPoint();
-  firOpBuilder.setInsertionPoint(op);
-
-  mlir::Value reductionOp;
-  if (convertOp)
-    reductionOp = convertOp->getOperand();
-  else if (op->getOperand(0) == loadVal)
-    reductionOp = op->getOperand(1);
-  else
-    reductionOp = op->getOperand(0);
-
-  firOpBuilder.create(op->getLoc(), reductionOp,
-                      reductionVal);
-  firOpBuilder.restoreInsertionPoint(insertPtDel);
-}
-
-static void removeStoreOp(mlir::Operation *reductionOp, mlir::Value symVal) {
-  for (mlir::Operation *reductionOpUse : reductionOp->getUsers()) {
-    if (auto convertReduction =
-            mlir::dyn_cast(reductionOpUse)) {
-      for (mlir::Operation *convertReductionUse :
-           convertReduction.getRes().getUsers()) {
-        if (auto storeOp = mlir::dyn_cast(convertReductionUse)) {
-          if (storeOp.getMemref() == symVal)
-            storeOp.erase();
-        }
-        if (auto assignOp =
-                mlir::dyn_cast(convertReductionUse)) {
-          if (assignOp.getLhs() == symVal)
-            assignOp.erase();
-        }
-      }
+/// Extract the list of function and variable symbols affected by the given
+/// 'declare target' directive and return the intended device type for them.
+static void getDeclareTargetInfo( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, + mlir::omp::DeclareTargetClauseOps &clauseOps, + llvm::SmallVectorImpl &symbolAndClause) { + const auto &spec = std::get( + declareTargetConstruct.t); + if (const auto *objectList{ + Fortran::parser::Unwrap(spec.u)}) { + ObjectList objects{makeObjects(*objectList, semaCtx)}; + // Case: declare target(func, var1, var2) + gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, + symbolAndClause); + } else if (const auto *clauseList{ + Fortran::parser::Unwrap( + spec.u)}) { + if (clauseList->v.empty()) { + // Case: declare target, implicit capture of function + symbolAndClause.emplace_back( + mlir::omp::DeclareTargetCaptureClause::to, + eval.getOwningProcedure()->getSubprogramSymbol()); } + + ClauseProcessor cp(converter, semaCtx, *clauseList); + cp.processDeviceType(clauseOps); + cp.processEnter(symbolAndClause); + cp.processLink(symbolAndClause); + cp.processTo(symbolAndClause); + + cp.processTODO(converter.getCurrentLocation(), + llvm::omp::Directive::OMPD_declare_target); } } -// Generate an OpenMP reduction operation. -// TODO: Currently assumes it is either an integer addition/multiplication -// reduction, or a logical and reduction. Generalize this for various reduction -// operation types. -// TODO: Generate the reduction operation during lowering instead of creating -// and removing operations since this is not a robust approach. Also, removing -// ops in the builder (instead of a rewriter) is probably not the best approach. -static void -genOpenMPReduction(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); +static void collectDeferredDeclareTargets( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, + llvm::SmallVectorImpl + &deferredDeclareTarget) { + mlir::omp::DeclareTargetClauseOps clauseOps; + llvm::SmallVector symbolAndClause; + getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, + clauseOps, symbolAndClause); + // Return the device type only if at least one of the targets for the + // directive is a function or subroutine + mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - List clauses{makeClauses(clauseList, semaCtx)}; - - for (const Clause &clause : clauses) { - if (const auto &reductionClause = - std::get_if(&clause.u)) { - const auto &redOperatorList{ - std::get( - reductionClause->t)}; - assert(redOperatorList.size() == 1 && "Expecting single operator"); - const auto &redOperator = redOperatorList.front(); - const auto &objects{std::get(reductionClause->t)}; - if (const auto *reductionOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - reductionOp->u)}; - - switch (intrinsicOp) { - case clause::DefinedOperator::IntrinsicOperator::Add: - case clause::DefinedOperator::IntrinsicOperator::Multiply: - case clause::DefinedOperator::IntrinsicOperator::AND: - case clause::DefinedOperator::IntrinsicOperator::EQV: - case clause::DefinedOperator::IntrinsicOperator::OR: - case clause::DefinedOperator::IntrinsicOperator::NEQV: - break; - 
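Since 'declare target' may name procedures whose func.func has not been created yet at this point in lowering, the marking above is deferred rather than applied immediately. A minimal sketch of that queue-then-mark pattern follows, with hypothetical stand-in types (illustration only; the real code stores the Fortran symbol, the device type, and the capture clause).

#include <string>
#include <vector>

struct DeferredMark {
  std::string mangledName; // symbol to look up once its operation exists
  int deviceType;          // stand-in for mlir::omp::DeclareTargetDeviceType
  int captureClause;       // stand-in for DeclareTargetCaptureClause
};

// Queue every symbol whose target operation has not been generated yet; a
// later pass marks the queued symbols once lowering has created their ops.
template <typename OpExistsFn>
static void collectDeferred(const std::vector<std::string> &names,
                            int deviceType, int captureClause,
                            OpExistsFn opExists,
                            std::vector<DeferredMark> &deferred) {
  for (const std::string &name : names)
    if (!opExists(name))
      deferred.push_back({name, deviceType, captureClause});
}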
default: - continue; - } - for (const Object &object : objects) { - if (const Fortran::semantics::Symbol *symbol = object.id()) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - mlir::Type reductionType = - reductionVal.getType().cast().getEleTy(); - if (!reductionType.isa()) { - if (!reductionType.isIntOrIndexOrFloat()) - continue; - } - for (mlir::OpOperand &reductionValUse : reductionVal.getUses()) { - if (auto loadOp = - mlir::dyn_cast(reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - if (reductionType.isa()) { - mlir::Operation *reductionOp = findReductionChain(loadVal); - fir::ConvertOp convertOp = - getConvertFromReductionOp(reductionOp, loadVal); - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal, &convertOp); - removeStoreOp(reductionOp, reductionVal); - } else if (mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal)) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } - } - } - } - } - } else if (const auto *reductionIntrinsic = - std::get_if(&redOperator.u)) { - if (!ReductionProcessor::supportedIntrinsicProcReduction( - *reductionIntrinsic)) - continue; - ReductionProcessor::ReductionIdentifier redId = - ReductionProcessor::getReductionType(*reductionIntrinsic); - for (const Object &object : objects) { - if (const Fortran::semantics::Symbol *symbol = object.id()) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - for (const mlir::OpOperand &reductionValUse : - reductionVal.getUses()) { - if (auto loadOp = - mlir::dyn_cast(reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - // Max is lowered as a compare -> select. - // Match the pattern here. - mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal); - if (reductionOp == nullptr) - continue; - - if (redId == ReductionProcessor::ReductionIdentifier::MAX || - redId == ReductionProcessor::ReductionIdentifier::MIN) { - assert(mlir::isa(reductionOp) && - "Selection Op not found in reduction intrinsic"); - mlir::Operation *compareOp = - getCompareFromReductionOp(reductionOp, loadVal); - updateReduction(compareOp, firOpBuilder, loadVal, - reductionVal); - } - if (redId == ReductionProcessor::ReductionIdentifier::IOR || - redId == ReductionProcessor::ReductionIdentifier::IEOR || - redId == ReductionProcessor::ReductionIdentifier::IAND) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } - } - } - } - } - } + for (const DeclareTargetCapturePair &symClause : symbolAndClause) { + mlir::Operation *op = mod.lookupSymbol(converter.mangleName( + std::get(symClause))); + + if (!op) { + deferredDeclareTarget.push_back({std::get<0>(symClause), + clauseOps.deviceType, + std::get<1>(symClause)}); } } } -struct OpWithBodyGenInfo { - /// A type for a code-gen callback function. This takes as argument the op for - /// which the code is being generated and returns the arguments of the op's - /// region. 
- using GenOMPRegionEntryCBFn = - std::function( - mlir::Operation *)>; +static std::optional +getDeclareTargetFunctionDevice( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPDeclareTargetConstruct + &declareTargetConstruct) { + mlir::omp::DeclareTargetClauseOps clauseOps; + llvm::SmallVector symbolAndClause; + getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, + clauseOps, symbolAndClause); - OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location loc, Fortran::lower::pft::Evaluation &eval) - : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval) {} + // Return the device type only if at least one of the targets for the + // directive is a function or subroutine + mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); + for (const DeclareTargetCapturePair &symClause : symbolAndClause) { + mlir::Operation *op = mod.lookupSymbol(converter.mangleName( + std::get(symClause))); - OpWithBodyGenInfo &setGenNested(bool value) { - genNested = value; - return *this; + if (mlir::isa_and_nonnull(op)) + return clauseOps.deviceType; } - OpWithBodyGenInfo &setOuterCombined(bool value) { - outerCombined = value; - return *this; - } + return std::nullopt; +} - OpWithBodyGenInfo &setClauses(const Fortran::parser::OmpClauseList *value) { - clauses = value; - return *this; - } +static llvm::SmallVector +genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef args) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + auto ®ion = op->getRegion(0); - OpWithBodyGenInfo &setDataSharingProcessor(DataSharingProcessor *value) { - dsp = value; - return *this; + std::size_t loopVarTypeSize = 0; + for (const Fortran::semantics::Symbol *arg : args) + loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + llvm::SmallVector tiv(args.size(), loopVarType); + llvm::SmallVector locs(args.size(), loc); + firOpBuilder.createBlock(®ion, {}, tiv, locs); + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. + mlir::Operation *storeOp = nullptr; + for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { + mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex)); + storeOp = + createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); } + firOpBuilder.setInsertionPointAfter(storeOp); + return llvm::SmallVector(args); +} - OpWithBodyGenInfo &setReductions( - llvm::SmallVectorImpl *value1, - llvm::SmallVectorImpl *value2) { - reductionSymbols = value1; - reductionTypes = value2; - return *this; - } +static void genReductionVars( + mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef reductionArgs, + llvm::ArrayRef reductionTypes) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + llvm::SmallVector blockArgLocs(reductionArgs.size(), loc); - OpWithBodyGenInfo &setGenRegionEntryCb(GenOMPRegionEntryCBFn value) { - genRegionEntryCB = value; - return *this; + mlir::Block *entryBlock = firOpBuilder.createBlock( + &op->getRegion(0), {}, reductionTypes, blockArgLocs); + + // Bind the reduction arguments to their block arguments. 
+ for (auto [arg, prv] : + llvm::zip_equal(reductionArgs, entryBlock->getArguments())) { + converter.bindSymbol(*arg, prv); } +} - /// [inout] converter to use for the clauses. - Fortran::lower::AbstractConverter &converter; - /// [in] Semantics context - Fortran::semantics::SemanticsContext &semaCtx; - /// [in] location in source code. - mlir::Location loc; - /// [in] current PFT node/evaluation. - Fortran::lower::pft::Evaluation &eval; - /// [in] whether to generate FIR for nested evaluations - bool genNested = true; - /// [in] is this an outer operation - prevents privatization. - bool outerCombined = false; - /// [in] list of clauses to process. - const Fortran::parser::OmpClauseList *clauses = nullptr; - /// [in] if provided, processes the construct's data-sharing attributes. - DataSharingProcessor *dsp = nullptr; - /// [in] if provided, list of reduction symbols - llvm::SmallVectorImpl *reductionSymbols = - nullptr; - /// [in] if provided, list of reduction types - llvm::SmallVectorImpl *reductionTypes = nullptr; - /// [in] if provided, emits the op's region entry. Otherwise, an emtpy block - /// is created in the region. - GenOMPRegionEntryCBFn genRegionEntryCB = nullptr; -}; +static llvm::SmallVector +genLoopAndReductionVars( + mlir::Operation *op, Fortran::lower::AbstractConverter &converter, + mlir::Location &loc, + llvm::ArrayRef loopArgs, + llvm::ArrayRef reductionArgs, + llvm::ArrayRef reductionTypes) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + llvm::SmallVector blockArgTypes; + llvm::SmallVector blockArgLocs; + blockArgTypes.reserve(loopArgs.size() + reductionArgs.size()); + blockArgLocs.reserve(blockArgTypes.size()); + mlir::Block *entryBlock; + + if (loopArgs.size()) { + std::size_t loopVarTypeSize = 0; + for (const Fortran::semantics::Symbol *arg : loopArgs) + loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); + mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); + std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(), + loopVarType); + std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc); + } + if (reductionArgs.size()) { + llvm::copy(reductionTypes, std::back_inserter(blockArgTypes)); + std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc); + } + entryBlock = firOpBuilder.createBlock(&op->getRegion(0), {}, blockArgTypes, + blockArgLocs); + // The argument is not currently in memory, so make a temporary for the + // argument, and store it there, then bind that location to the argument. 
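+  // Schematically, for an i32 loop variable the code below produces
+  // something like (illustrative FIR only, not emitted verbatim):
+  //   %priv = fir.alloca i32
+  //   %cvt  = fir.convert %arg0 : (index) -> i32
+  //   fir.store %cvt to %priv : !fir.ref<i32>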
+  if (loopArgs.size()) {
+    mlir::Operation *storeOp = nullptr;
+    for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) {
+      mlir::Value indexVal =
+          fir::getBase(op->getRegion(0).front().getArgument(argIndex));
+      storeOp =
+          createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol);
+    }
+    firOpBuilder.setInsertionPointAfter(storeOp);
+  }
+  // Bind the reduction arguments to their block arguments.
+  for (auto [arg, prv] : llvm::zip_equal(
+           reductionArgs,
+           llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) {
+    converter.bindSymbol(*arg, prv);
+  }
+
+  return llvm::SmallVector(loopArgs);
+}
+
+static void
+markDeclareTarget(mlir::Operation *op,
+                  Fortran::lower::AbstractConverter &converter,
+                  mlir::omp::DeclareTargetCaptureClause captureClause,
+                  mlir::omp::DeclareTargetDeviceType deviceType) {
+  // TODO: Add support for program local variables with declare target applied
+  auto declareTargetOp = llvm::dyn_cast(op);
+  if (!declareTargetOp)
+    fir::emitFatalError(
+        converter.getCurrentLocation(),
+        "Attempt to apply declare target on unsupported operation");
+
+  // The function or global may already have declare target applied to it,
+  // most likely through implicit capture (usage in another declare target
+  // function/subroutine). If it has been assigned both host and nohost
+  // device types, mark it as 'any'; otherwise skip it, as there is no change.
+  if (declareTargetOp.isDeclareTarget()) {
+    if (declareTargetOp.getDeclareTargetDeviceType() != deviceType)
+      declareTargetOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any,
+                                       captureClause);
+    return;
+  }
+
+  declareTargetOp.setDeclareTarget(deviceType, captureClause);
+}
+
+/// Split a combined directive into an outer leaf directive and the (possibly
+/// combined) rest of the combined directive. Composite and non-compound
+/// directives are not split; in that case the input directive is returned as
+/// the first output and std::nullopt as the second.
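+/// For example (per the mapping below; illustrative only):
+///   splitCombinedDirective(OMPD_target_teams) -> {OMPD_target, OMPD_teams}
+///   splitCombinedDirective(OMPD_parallel_do)  -> {OMPD_parallel, OMPD_do}
+///   splitCombinedDirective(OMPD_simd)         -> {OMPD_simd, std::nullopt}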
+static std::pair> +splitCombinedDirective(llvm::omp::Directive dir) { + using D = llvm::omp::Directive; + switch (dir) { + case D::OMPD_masked_taskloop: + return {D::OMPD_masked, D::OMPD_taskloop}; + case D::OMPD_masked_taskloop_simd: + return {D::OMPD_masked, D::OMPD_taskloop_simd}; + case D::OMPD_master_taskloop: + return {D::OMPD_master, D::OMPD_taskloop}; + case D::OMPD_master_taskloop_simd: + return {D::OMPD_master, D::OMPD_taskloop_simd}; + case D::OMPD_parallel_do: + return {D::OMPD_parallel, D::OMPD_do}; + case D::OMPD_parallel_do_simd: + return {D::OMPD_parallel, D::OMPD_do_simd}; + case D::OMPD_parallel_masked: + return {D::OMPD_parallel, D::OMPD_masked}; + case D::OMPD_parallel_masked_taskloop: + return {D::OMPD_parallel, D::OMPD_masked_taskloop}; + case D::OMPD_parallel_masked_taskloop_simd: + return {D::OMPD_parallel, D::OMPD_masked_taskloop_simd}; + case D::OMPD_parallel_master: + return {D::OMPD_parallel, D::OMPD_master}; + case D::OMPD_parallel_master_taskloop: + return {D::OMPD_parallel, D::OMPD_master_taskloop}; + case D::OMPD_parallel_master_taskloop_simd: + return {D::OMPD_parallel, D::OMPD_master_taskloop_simd}; + case D::OMPD_parallel_sections: + return {D::OMPD_parallel, D::OMPD_sections}; + case D::OMPD_parallel_workshare: + return {D::OMPD_parallel, D::OMPD_workshare}; + case D::OMPD_target_parallel: + return {D::OMPD_target, D::OMPD_parallel}; + case D::OMPD_target_parallel_do: + return {D::OMPD_target, D::OMPD_parallel_do}; + case D::OMPD_target_parallel_do_simd: + return {D::OMPD_target, D::OMPD_parallel_do_simd}; + case D::OMPD_target_simd: + return {D::OMPD_target, D::OMPD_simd}; + case D::OMPD_target_teams: + return {D::OMPD_target, D::OMPD_teams}; + case D::OMPD_target_teams_distribute: + return {D::OMPD_target, D::OMPD_teams_distribute}; + case D::OMPD_target_teams_distribute_parallel_do: + return {D::OMPD_target, D::OMPD_teams_distribute_parallel_do}; + case D::OMPD_target_teams_distribute_parallel_do_simd: + return {D::OMPD_target, D::OMPD_teams_distribute_parallel_do_simd}; + case D::OMPD_target_teams_distribute_simd: + return {D::OMPD_target, D::OMPD_teams_distribute_simd}; + case D::OMPD_teams_distribute: + return {D::OMPD_teams, D::OMPD_distribute}; + case D::OMPD_teams_distribute_parallel_do: + return {D::OMPD_teams, D::OMPD_distribute_parallel_do}; + case D::OMPD_teams_distribute_parallel_do_simd: + return {D::OMPD_teams, D::OMPD_distribute_parallel_do_simd}; + case D::OMPD_teams_distribute_simd: + return {D::OMPD_teams, D::OMPD_distribute_simd}; + case D::OMPD_parallel_loop: + return {D::OMPD_parallel, D::OMPD_loop}; + case D::OMPD_target_parallel_loop: + return {D::OMPD_target, D::OMPD_parallel_loop}; + case D::OMPD_target_teams_loop: + return {D::OMPD_target, D::OMPD_teams_loop}; + case D::OMPD_teams_loop: + return {D::OMPD_teams, D::OMPD_loop}; + default: + return {dir, std::nullopt}; + } +} + +//===----------------------------------------------------------------------===// +// Op body generation helper structures and functions +//===----------------------------------------------------------------------===// + +struct OpWithBodyGenInfo { + /// A type for a code-gen callback function. This takes as argument the op for + /// which the code is being generated and returns the arguments of the op's + /// region. 
+  using GenOMPRegionEntryCBFn =
+      std::function(
+          mlir::Operation *)>;
+
+  OpWithBodyGenInfo(Fortran::lower::AbstractConverter &converter,
+                    Fortran::semantics::SemanticsContext &semaCtx,
+                    mlir::Location loc, Fortran::lower::pft::Evaluation &eval,
+                    llvm::omp::Directive dir)
+      : converter(converter), semaCtx(semaCtx), loc(loc), eval(eval), dir(dir) {
+  }
+
+  OpWithBodyGenInfo &setGenNested(bool value) {
+    genNested = value;
+    return *this;
+  }
+
+  OpWithBodyGenInfo &setOuterCombined(bool value) {
+    outerCombined = value;
+    return *this;
+  }
+
+  OpWithBodyGenInfo &setClauses(const Fortran::parser::OmpClauseList *value) {
+    clauses = value;
+    return *this;
+  }
+
+  OpWithBodyGenInfo &setDataSharingProcessor(DataSharingProcessor *value) {
+    dsp = value;
+    return *this;
+  }
+
+  OpWithBodyGenInfo &setReductions(
+      llvm::SmallVectorImpl *value1,
+      llvm::SmallVectorImpl *value2) {
+    reductionSymbols = value1;
+    reductionTypes = value2;
+    return *this;
+  }
+
+  OpWithBodyGenInfo &setGenRegionEntryCb(GenOMPRegionEntryCBFn value) {
+    genRegionEntryCB = value;
+    return *this;
+  }
+
+  /// [inout] converter to use for the clauses.
+  Fortran::lower::AbstractConverter &converter;
+  /// [in] Semantics context
+  Fortran::semantics::SemanticsContext &semaCtx;
+  /// [in] location in source code.
+  mlir::Location loc;
+  /// [in] current PFT node/evaluation.
+  Fortran::lower::pft::Evaluation &eval;
+  /// [in] leaf directive for which to generate the op body.
+  llvm::omp::Directive dir;
+  /// [in] whether to generate FIR for nested evaluations
+  bool genNested = true;
+  /// [in] is this an outer operation - prevents privatization.
+  bool outerCombined = false;
+  /// [in] list of clauses to process.
+  const Fortran::parser::OmpClauseList *clauses = nullptr;
+  /// [in] if provided, processes the construct's data-sharing attributes.
+  DataSharingProcessor *dsp = nullptr;
+  /// [in] if provided, list of reduction symbols
+  llvm::SmallVectorImpl *reductionSymbols =
+      nullptr;
+  /// [in] if provided, list of reduction types
+  llvm::SmallVectorImpl *reductionTypes = nullptr;
+  /// [in] if provided, emits the op's region entry. Otherwise, an empty block
+  /// is created in the region.
+  GenOMPRegionEntryCBFn genRegionEntryCB = nullptr;
+};
 
 /// Create the body (block) for an OpenMP Operation.
 ///
 /// \param [in] op - the operation the body belongs to.
 /// \param [in] info - options controlling code-gen for the construction.
-template
-static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
+static void createBodyOfOp(mlir::Operation &op, OpWithBodyGenInfo &info) {
   fir::FirOpBuilder &firOpBuilder = info.converter.getFirOpBuilder();
 
   auto insertMarker = [](fir::FirOpBuilder &builder) {
@@ -537,10 +663,10 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
 
   auto regionArgs = [&]() -> llvm::SmallVector {
     if (info.genRegionEntryCB != nullptr) {
-      return info.genRegionEntryCB(op);
+      return info.genRegionEntryCB(&op);
     }
 
-    firOpBuilder.createBlock(&op.getRegion());
+    firOpBuilder.createBlock(&op.getRegion(0));
     return {};
   }();
   // Mark the earliest insertion point.
@@ -555,8 +681,8 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) {
 
   // Start with privatization, so that the lowering of the nested
   // code will use the right symbols.
- constexpr bool isLoop = std::is_same_v || - std::is_same_v; + bool isLoop = llvm::omp::getDirectiveAssociation(info.dir) == + llvm::omp::Association::Loop; bool privatize = info.clauses && !info.outerCombined; firOpBuilder.setInsertionPoint(marker); @@ -568,7 +694,7 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { } } - if constexpr (std::is_same_v) { + if (info.dir == llvm::omp::Directive::OMPD_parallel) { threadPrivatizeVars(info.converter, info.eval); if (info.clauses) { firOpBuilder.setInsertionPoint(marker); @@ -582,9 +708,9 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { // a lot of complications for our approach if the terminator generation // is delayed past this point. Insert a temporary terminator here, then // delete it. - firOpBuilder.setInsertionPointToEnd(&op.getRegion().back()); - auto *temp = Fortran::lower::genOpenMPTerminator( - firOpBuilder, op.getOperation(), info.loc); + firOpBuilder.setInsertionPointToEnd(&op.getRegion(0).back()); + auto *temp = + Fortran::lower::genOpenMPTerminator(firOpBuilder, &op, info.loc); firOpBuilder.setInsertionPointAfter(marker); genNestedEvaluations(info.converter, info.eval); temp->erase(); @@ -626,23 +752,36 @@ static void createBodyOfOp(Op &op, OpWithBodyGenInfo &info) { return exit; }; - if (auto *exitBlock = getUniqueExit(op.getRegion())) { + if (auto *exitBlock = getUniqueExit(op.getRegion(0))) { firOpBuilder.setInsertionPointToEnd(exitBlock); - auto *term = Fortran::lower::genOpenMPTerminator( - firOpBuilder, op.getOperation(), info.loc); + auto *term = + Fortran::lower::genOpenMPTerminator(firOpBuilder, &op, info.loc); // Only insert lastprivate code when there actually is an exit block. // Such a block may not exist if the nested code produced an infinite // loop (this may not make sense in production code, but a user could // write that and we should handle it). firOpBuilder.setInsertionPoint(term); if (privatize) { + // DataSharingProcessor::processStep2() may create operations before/after + // the one passed as argument. We need to treat loop wrappers and their + // nested loop as a unit, so we need to pass the top level wrapper (if + // present). Otherwise, these operations will be inserted within a + // wrapper region. 
+      mlir::Operation *privatizationTopLevelOp = &op;
+      if (auto loopNest = llvm::dyn_cast(op)) {
+        llvm::SmallVector wrappers;
+        loopNest.gatherWrappers(wrappers);
+        if (!wrappers.empty())
+          privatizationTopLevelOp = &*wrappers.back();
+      }
+
       if (!info.dsp) {
         assert(tempDsp.has_value());
-        tempDsp->processStep2(op, isLoop);
+        tempDsp->processStep2(privatizationTopLevelOp, isLoop);
       } else {
         if (isLoop && regionArgs.size() > 0)
           info.dsp->setLoopIV(info.converter.getSymbolAddress(*regionArgs[0]));
-        info.dsp->processStep2(op, isLoop);
+        info.dsp->processStep2(privatizationTopLevelOp, isLoop);
       }
     }
   }
@@ -715,398 +854,42 @@ static void genBodyOfTargetDataOp(
   genNestedEvaluations(converter, eval);
 }
 
-template
-static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) {
-  auto op = info.converter.getFirOpBuilder().create(
-      info.loc, std::forward(args)...);
-  createBodyOfOp(op, info);
-  return op;
-}
-
-static mlir::omp::MasterOp
-genMasterOp(Fortran::lower::AbstractConverter &converter,
-            Fortran::semantics::SemanticsContext &semaCtx,
-            Fortran::lower::pft::Evaluation &eval, bool genNested,
-            mlir::Location currentLocation) {
-  return genOpWithBody(
-      OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
-          .setGenNested(genNested));
-}
-
-static mlir::omp::OrderedRegionOp
-genOrderedRegionOp(Fortran::lower::AbstractConverter &converter,
-                   Fortran::semantics::SemanticsContext &semaCtx,
-                   Fortran::lower::pft::Evaluation &eval, bool genNested,
-                   mlir::Location currentLocation,
-                   const Fortran::parser::OmpClauseList &clauseList) {
-  mlir::omp::OrderedRegionClauseOps clauseOps;
-
-  ClauseProcessor cp(converter, semaCtx, clauseList);
-  cp.processTODO(currentLocation,
-                 llvm::omp::Directive::OMPD_ordered);
-
-  return genOpWithBody(
-      OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
-          .setGenNested(genNested),
-      clauseOps);
-}
+// This function creates a block for the body of the targetOp's region. It adds
+// all the symbols present in mapSyms as block arguments to this block.
+static void +genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::omp::TargetOp &targetOp, + llvm::ArrayRef mapSyms, + llvm::ArrayRef mapSymLocs, + llvm::ArrayRef mapSymTypes, + const mlir::Location ¤tLocation) { + assert(mapSymTypes.size() == mapSymLocs.size()); -static mlir::omp::ParallelOp -genParallelOp(Fortran::lower::AbstractConverter &converter, - Fortran::lower::SymMap &symTable, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - Fortran::lower::StatementContext stmtCtx; - mlir::omp::ParallelClauseOps clauseOps; - llvm::SmallVector privateSyms; - llvm::SmallVector reductionTypes; - llvm::SmallVector reductionSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps); - cp.processNumThreads(stmtCtx, clauseOps); - cp.processProcBind(clauseOps); - cp.processDefault(); - cp.processAllocate(clauseOps); - - if (!outerCombined) - cp.processReduction(currentLocation, clauseOps, &reductionTypes, - &reductionSyms); + mlir::Region ®ion = targetOp.getRegion(); - if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) - clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); + auto *regionBlock = + firOpBuilder.createBlock(®ion, {}, mapSymTypes, mapSymLocs); - auto reductionCallback = [&](mlir::Operation *op) { - llvm::SmallVector locs(clauseOps.reductionVars.size(), - currentLocation); - auto *block = - firOpBuilder.createBlock(&op->getRegion(0), {}, reductionTypes, locs); - for (auto [arg, prv] : - llvm::zip_equal(reductionSyms, block->getArguments())) { - converter.bindSymbol(*arg, prv); + // Clones the `bounds` placing them inside the target region and returns them. 
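+  // (An omp.target region is IsolatedFromAbove, so SSA values defined outside
+  // the region cannot be referenced inside it; memory-effect-free defining
+  // ops for the bounds are therefore re-created inside the region instead.)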
+ auto cloneBound = [&](mlir::Value bound) { + if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { + mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); + regionBlock->push_back(clonedOp); + return clonedOp->getResult(0); } - return reductionSyms; + TODO(converter.getCurrentLocation(), + "target map clause operand unsupported bound type"); }; - OpWithBodyGenInfo genInfo = - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setOuterCombined(outerCombined) - .setClauses(&clauseList) - .setReductions(&reductionSyms, &reductionTypes) - .setGenRegionEntryCb(reductionCallback); - - if (!enableDelayedPrivatization) - return genOpWithBody(genInfo, clauseOps); - - bool privatize = !outerCombined; - DataSharingProcessor dsp(converter, semaCtx, clauseList, eval, - /*useDelayedPrivatization=*/true, &symTable); - - if (privatize) - dsp.processStep1(&clauseOps, &privateSyms); - - auto genRegionEntryCB = [&](mlir::Operation *op) { - auto parallelOp = llvm::cast(op); - - llvm::SmallVector reductionLocs( - clauseOps.reductionVars.size(), currentLocation); - - mlir::OperandRange privateVars = parallelOp.getPrivateVars(); - mlir::Region ®ion = parallelOp.getRegion(); - - llvm::SmallVector privateVarTypes = reductionTypes; - privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(privateVarTypes), - [](mlir::Value v) { return v.getType(); }); - - llvm::SmallVector privateVarLocs = reductionLocs; - privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(privateVarLocs), - [](mlir::Value v) { return v.getLoc(); }); - - firOpBuilder.createBlock(®ion, /*insertPt=*/{}, privateVarTypes, - privateVarLocs); - - llvm::SmallVector allSymbols = - reductionSyms; - allSymbols.append(privateSyms); - for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { - converter.bindSymbol(*arg, prv); - } - - return allSymbols; - }; - - // TODO Merge with the reduction CB. - genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); - return genOpWithBody(genInfo, clauseOps); -} - -static mlir::omp::SectionOp -genSectionOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList §ionsClauseList) { - // Currently only private/firstprivate clause is handled, and - // all privatization is done within `omp.section` operations. - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(§ionsClauseList)); -} - -static mlir::omp::SingleOp -genSingleOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &beginClauseList, - const Fortran::parser::OmpClauseList &endClauseList) { - mlir::omp::SingleClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, beginClauseList); - cp.processAllocate(clauseOps); - // TODO Support delayed privatization. 
- - ClauseProcessor ecp(converter, semaCtx, endClauseList); - ecp.processNowait(clauseOps); - ecp.processCopyprivate(currentLocation, clauseOps); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&beginClauseList), - clauseOps); -} - -static mlir::omp::TaskOp -genTaskOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TaskClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps); - cp.processAllocate(clauseOps); - cp.processDefault(); - cp.processFinal(stmtCtx, clauseOps); - cp.processUntied(clauseOps); - cp.processMergeable(clauseOps); - cp.processPriority(stmtCtx, clauseOps); - cp.processDepend(clauseOps); - // TODO Support delayed privatization. - - cp.processTODO( - currentLocation, llvm::omp::Directive::OMPD_task); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&clauseList), - clauseOps); -} - -static mlir::omp::TaskgroupOp -genTaskgroupOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - mlir::omp::TaskgroupClauseOps clauseOps; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processAllocate(clauseOps); - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_taskgroup); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setClauses(&clauseList), - clauseOps); -} - -// This helper function implements the functionality of "promoting" -// non-CPTR arguments of use_device_ptr to use_device_addr -// arguments (automagic conversion of use_device_ptr -> -// use_device_addr in these cases). The way we do so currently is -// through the shuffling of operands from the devicePtrOperands to -// deviceAddrOperands where neccesary and re-organizing the types, -// locations and symbols to maintain the correct ordering of ptr/addr -// input -> BlockArg. -// -// This effectively implements some deprecated OpenMP functionality -// that some legacy applications unfortunately depend on -// (deprecated in specification version 5.2): -// -// "If a list item in a use_device_ptr clause is not of type C_PTR, -// the behavior is as if the list item appeared in a use_device_addr -// clause. Support for such list items in a use_device_ptr clause -// is deprecated." -static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr( - mlir::omp::UseDeviceClauseOps &clauseOps, - llvm::SmallVectorImpl &useDeviceTypes, - llvm::SmallVectorImpl &useDeviceLocs, - llvm::SmallVectorImpl - &useDeviceSymbols) { - auto moveElementToBack = [](size_t idx, auto &vector) { - auto *iter = std::next(vector.begin(), idx); - vector.push_back(*iter); - vector.erase(iter); - }; - - // Iterate over our use_device_ptr list and shift all non-cptr arguments into - // use_device_addr. 
- for (auto *it = clauseOps.useDevicePtrVars.begin(); - it != clauseOps.useDevicePtrVars.end();) { - if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) { - clauseOps.useDeviceAddrVars.push_back(*it); - // We have to shuffle the symbols around as well, to maintain - // the correct Input -> BlockArg for use_device_ptr/use_device_addr. - // NOTE: However, as map's do not seem to be included currently - // this isn't as pertinent, but we must try to maintain for - // future alterations. I believe the reason they are not currently - // is that the BlockArg assign/lowering needs to be extended - // to a greater set of types. - auto idx = std::distance(clauseOps.useDevicePtrVars.begin(), it); - moveElementToBack(idx, useDeviceTypes); - moveElementToBack(idx, useDeviceLocs); - moveElementToBack(idx, useDeviceSymbols); - it = clauseOps.useDevicePtrVars.erase(it); - continue; - } - ++it; - } -} - -static mlir::omp::TargetDataOp -genTargetDataOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetDataClauseOps clauseOps; - llvm::SmallVector useDeviceTypes; - llvm::SmallVector useDeviceLocs; - llvm::SmallVector useDeviceSyms; - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs, - useDeviceSyms); - cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs, - useDeviceSyms); - - // This function implements the deprecated functionality of use_device_ptr - // that allows users to provide non-CPTR arguments to it with the caveat - // that the compiler will treat them as use_device_addr. A lot of legacy - // code may still depend on this functionality, so we should support it - // in some manner. We do so currently by simply shifting non-cptr operands - // from the use_device_ptr list into the front of the use_device_addr list - // whilst maintaining the ordering of useDeviceLocs, useDeviceSymbols and - // useDeviceTypes to use_device_ptr/use_device_addr input for BlockArg - // ordering. - // TODO: Perhaps create a user provideable compiler option that will - // re-introduce a hard-error rather than a warning in these cases. - promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(clauseOps, useDeviceTypes, - useDeviceLocs, useDeviceSyms); - cp.processMap(currentLocation, llvm::omp::Directive::OMPD_target_data, - stmtCtx, clauseOps); - - auto dataOp = converter.getFirOpBuilder().create( - currentLocation, clauseOps); - - genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, dataOp, - useDeviceTypes, useDeviceLocs, useDeviceSyms, - currentLocation); - return dataOp; -} - -template -static OpTy genTargetEnterExitDataUpdateOp( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; - - // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. 
- [[maybe_unused]] llvm::omp::Directive directive; - if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_enter_data; - } else if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_exit_data; - } else if constexpr (std::is_same_v) { - directive = llvm::omp::Directive::OMPD_target_update; - } else { - return nullptr; - } - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(directive, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processDepend(clauseOps); - cp.processNowait(clauseOps); - - if constexpr (std::is_same_v) { - cp.processMotionClauses(stmtCtx, clauseOps); - cp.processMotionClauses(stmtCtx, clauseOps); - } else { - cp.processMap(currentLocation, directive, stmtCtx, clauseOps); - } - - return firOpBuilder.create(currentLocation, clauseOps); -} - -// This functions creates a block for the body of the targetOp's region. It adds -// all the symbols present in mapSymbols as block arguments to this block. -static void -genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::omp::TargetOp &targetOp, - llvm::ArrayRef mapSyms, - llvm::ArrayRef mapSymLocs, - llvm::ArrayRef mapSymTypes, - const mlir::Location ¤tLocation) { - assert(mapSymTypes.size() == mapSymLocs.size()); - - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Region ®ion = targetOp.getRegion(); - - auto *regionBlock = - firOpBuilder.createBlock(®ion, {}, mapSymTypes, mapSymLocs); - - // Clones the `bounds` placing them inside the target region and returns them. - auto cloneBound = [&](mlir::Value bound) { - if (mlir::isMemoryEffectFree(bound.getDefiningOp())) { - mlir::Operation *clonedOp = bound.getDefiningOp()->clone(); - regionBlock->push_back(clonedOp); - return clonedOp->getResult(0); - } - TODO(converter.getCurrentLocation(), - "target map clause operand unsupported bound type"); - }; - - auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { - llvm::SmallVector clonedBounds; - for (mlir::Value bound : bounds) - clonedBounds.emplace_back(cloneBound(bound)); - return clonedBounds; - }; + auto cloneBounds = [cloneBound](llvm::ArrayRef bounds) { + llvm::SmallVector clonedBounds; + for (mlir::Value bound : bounds) + clonedBounds.emplace_back(cloneBound(bound)); + return clonedBounds; + }; // Bind the symbols to their corresponding block arguments. 
for (auto [argIndex, argSymbol] : llvm::enumerate(mapSyms)) { @@ -1225,456 +1008,842 @@ genBodyOfTargetOp(Fortran::lower::AbstractConverter &converter, genNestedEvaluations(converter, eval); } -static mlir::omp::TargetOp -genTargetOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - llvm::omp::Directive directive, bool outerCombined = false) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TargetClauseOps clauseOps; - llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; - llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; - llvm::SmallVector mapSyms, devicePtrSyms, - deviceAddrSyms; +template +static OpTy genOpWithBody(OpWithBodyGenInfo &info, Args &&...args) { + auto op = info.converter.getFirOpBuilder().create( + info.loc, std::forward(args)...); + createBodyOfOp(*op, info); + return op; +} - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); - cp.processDevice(stmtCtx, clauseOps); - cp.processThreadLimit(stmtCtx, clauseOps); +//===----------------------------------------------------------------------===// +// Code generation functions for clauses +//===----------------------------------------------------------------------===// + +static void genCriticalDeclareClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::CriticalClauseOps &clauseOps, llvm::StringRef name) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processHint(clauseOps); + clauseOps.nameAttr = + mlir::StringAttr::get(converter.getFirOpBuilder().getContext(), name); +} + +static void genFlushClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const std::optional &objects, + const std::optional> + &clauses, + mlir::Location loc, llvm::SmallVectorImpl &operandRange) { + if (objects) + genObjectList2(*objects, converter, operandRange); + + if (clauses && clauses->size() > 0) + TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); +} + +static void genLoopNestClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + mlir::omp::LoopNestClauseOps &clauseOps, + llvm::SmallVectorImpl &iv) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processCollapse(loc, eval, clauseOps, iv); + clauseOps.loopInclusiveAttr = converter.getFirOpBuilder().getUnitAttr(); +} + +static void +genOrderedRegionClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::OrderedRegionClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processTODO(loc, llvm::omp::Directive::OMPD_ordered); +} + +static void genParallelClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + bool processReduction, mlir::omp::ParallelClauseOps &clauseOps, + llvm::SmallVectorImpl &reductionTypes, + 
llvm::SmallVectorImpl &reductionSyms) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processDefault(); + cp.processIf(llvm::omp::Directive::OMPD_parallel, clauseOps); + cp.processNumThreads(stmtCtx, clauseOps); + cp.processProcBind(clauseOps); + + if (processReduction) { + cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); + if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) + clauseOps.reductionByRefAttr = converter.getFirOpBuilder().getUnitAttr(); + } +} + +static void genSectionsClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + bool clausesFromBeginSections, + mlir::omp::SectionsClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + if (clausesFromBeginSections) { + cp.processAllocate(clauseOps); + cp.processSectionsReduction(loc, clauseOps); + // TODO Support delayed privatization. + } else { + cp.processNowait(clauseOps); + } +} + +static void genSimdClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::SimdClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); + cp.processReduction(loc, clauseOps); + cp.processSafelen(clauseOps); + cp.processSimdlen(clauseOps); + // TODO Support delayed privatization. + + cp.processTODO( + loc, llvm::omp::Directive::OMPD_simd); +} + +static void genSingleClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &beginClauses, + const Fortran::parser::OmpClauseList &endClauses, + mlir::Location loc, + mlir::omp::SingleClauseOps &clauseOps) { + ClauseProcessor bcp(converter, semaCtx, beginClauses); + bcp.processAllocate(clauseOps); + // TODO Support delayed privatization. 
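+  // (In Fortran source, `nowait` and `copyprivate` appear on the END SINGLE
+  // directive, hence the separate ClauseProcessor over `endClauses` below.)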
+ + ClauseProcessor ecp(converter, semaCtx, endClauses); + ecp.processCopyprivate(loc, clauseOps); + ecp.processNowait(clauseOps); +} + +static void genTargetClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + bool processHostOnlyClauses, bool processReduction, + mlir::omp::TargetClauseOps &clauseOps, + llvm::SmallVectorImpl &mapSyms, + llvm::SmallVectorImpl &mapLocs, + llvm::SmallVectorImpl &mapTypes, + llvm::SmallVectorImpl &deviceAddrSyms, + llvm::SmallVectorImpl &deviceAddrLocs, + llvm::SmallVectorImpl &deviceAddrTypes, + llvm::SmallVectorImpl &devicePtrSyms, + llvm::SmallVectorImpl &devicePtrLocs, + llvm::SmallVectorImpl &devicePtrTypes) { + ClauseProcessor cp(converter, semaCtx, clauses); cp.processDepend(clauseOps); - cp.processNowait(clauseOps); - cp.processMap(currentLocation, directive, stmtCtx, clauseOps, &mapSyms, - &mapLocs, &mapTypes); - cp.processIsDevicePtr(clauseOps, devicePtrTypes, devicePtrLocs, - devicePtrSyms); + cp.processDevice(stmtCtx, clauseOps); cp.processHasDeviceAddr(clauseOps, deviceAddrTypes, deviceAddrLocs, deviceAddrSyms); + cp.processIf(llvm::omp::Directive::OMPD_target, clauseOps); + cp.processIsDevicePtr(clauseOps, devicePtrTypes, devicePtrLocs, + devicePtrSyms); + cp.processMap(loc, stmtCtx, clauseOps, &mapSyms, &mapLocs, &mapTypes); + cp.processThreadLimit(stmtCtx, clauseOps); // TODO Support delayed privatization. - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_target); - - // 5.8.1 Implicit Data-Mapping Attribute Rules - // The following code follows the implicit data-mapping rules to map all the - // symbols used inside the region that have not been explicitly mapped using - // the map clause. 
-  auto captureImplicitMap = [&](const Fortran::semantics::Symbol &sym) {
-    if (llvm::find(mapSyms, &sym) == mapSyms.end()) {
-      mlir::Value baseOp = converter.getSymbolAddress(sym);
-      if (!baseOp)
-        if (const auto *details = sym.template detailsIf<
-                Fortran::semantics::HostAssocDetails>()) {
-          baseOp = converter.getSymbolAddress(details->symbol());
-          converter.copySymbolBinding(details->symbol(), sym);
-        }
-
-      if (baseOp) {
-        llvm::SmallVector bounds;
-        std::stringstream name;
-        fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym);
-        name << sym.name().ToString();
+  if (processHostOnlyClauses)
+    cp.processNowait(clauseOps);
 
-        Fortran::lower::AddrAndBoundsInfo info =
-            getDataOperandBaseAddr(converter, converter.getFirOpBuilder(), sym,
-                                   converter.getCurrentLocation());
-        if (fir::unwrapRefType(info.addr.getType()).isa())
-          bounds =
-              Fortran::lower::genBoundsOpsFromBox(
-                  converter.getFirOpBuilder(), converter.getCurrentLocation(),
-                  converter, dataExv, info);
-        if (fir::unwrapRefType(info.addr.getType()).isa()) {
-          bool dataExvIsAssumedSize =
-              Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate());
-          bounds = Fortran::lower::genBaseBoundsOps(
-              converter.getFirOpBuilder(), converter.getCurrentLocation(),
-              converter, dataExv, dataExvIsAssumedSize);
-        }
+  cp.processTODO(loc,
+                 llvm::omp::Directive::OMPD_target);
+}
 
-        llvm::omp::OpenMPOffloadMappingFlags mapFlag =
-            llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT;
-        mlir::omp::VariableCaptureKind captureKind =
-            mlir::omp::VariableCaptureKind::ByRef;
+static void genTargetDataClauses(
+    Fortran::lower::AbstractConverter &converter,
+    Fortran::semantics::SemanticsContext &semaCtx,
+    Fortran::lower::StatementContext &stmtCtx,
+    const Fortran::parser::OmpClauseList &clauses, mlir::Location loc,
+    mlir::omp::TargetDataClauseOps &clauseOps,
+    llvm::SmallVectorImpl &useDeviceTypes,
+    llvm::SmallVectorImpl &useDeviceLocs,
+    llvm::SmallVectorImpl &useDeviceSyms) {
+  ClauseProcessor cp(converter, semaCtx, clauses);
+  cp.processDevice(stmtCtx, clauseOps);
+  cp.processIf(llvm::omp::Directive::OMPD_target_data, clauseOps);
+  cp.processMap(loc, stmtCtx, clauseOps);
+  cp.processUseDeviceAddr(clauseOps, useDeviceTypes, useDeviceLocs,
+                          useDeviceSyms);
+  cp.processUseDevicePtr(clauseOps, useDeviceTypes, useDeviceLocs,
+                         useDeviceSyms);
 
-        mlir::Type eleType = baseOp.getType();
-        if (auto refType = baseOp.getType().dyn_cast())
-          eleType = refType.getElementType();
+  // This function implements the deprecated functionality of use_device_ptr
+  // that allows users to provide non-CPTR arguments to it with the caveat
+  // that the compiler will treat them as use_device_addr. A lot of legacy
+  // code may still depend on this functionality, so we should support it
+  // in some manner. We do so currently by simply shifting non-cptr operands
+  // from the use_device_ptr list into the front of the use_device_addr list
+  // whilst maintaining the ordering of useDeviceLocs, useDeviceSyms and
+  // useDeviceTypes to use_device_ptr/use_device_addr input for BlockArg
+  // ordering.
+  // TODO: Perhaps create a user-providable compiler option that will
+  // re-introduce a hard-error rather than a warning in these cases.
+ promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(clauseOps, useDeviceTypes, + useDeviceLocs, useDeviceSyms); +} - // If a variable is specified in declare target link and if device - // type is not specified as `nohost`, it needs to be mapped tofrom - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - mlir::Operation *op = mod.lookupSymbol(converter.mangleName(sym)); - auto declareTargetOp = - llvm::dyn_cast_if_present<mlir::omp::DeclareTargetInterface>(op); - if (declareTargetOp && declareTargetOp.isDeclareTarget()) { - if (declareTargetOp.getDeclareTargetCaptureClause() == - mlir::omp::DeclareTargetCaptureClause::link && - declareTargetOp.getDeclareTargetDeviceType() != - mlir::omp::DeclareTargetDeviceType::nohost) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - } - } else if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { - captureKind = mlir::omp::VariableCaptureKind::ByCopy; - } else if (!fir::isa_builtin_cptr_type(eleType)) { - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; - mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; - } +static void genTargetEnterExitUpdateDataClauses( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, mlir::Location loc, + llvm::omp::Directive directive, + mlir::omp::TargetEnterExitUpdateDataClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processDepend(clauseOps); + cp.processDevice(stmtCtx, clauseOps); + cp.processIf(directive, clauseOps); + cp.processNowait(clauseOps); - mlir::Value mapOp = createMapInfoOp( - converter.getFirOpBuilder(), baseOp.getLoc(), baseOp, mlir::Value{}, - name.str(), bounds, {}, - static_cast< - std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>( - mapFlag), - captureKind, baseOp.getType()); + if (directive == llvm::omp::Directive::OMPD_target_update) { + cp.processMotionClauses<clause::To>(stmtCtx, clauseOps); + cp.processMotionClauses<clause::From>(stmtCtx, clauseOps); + } else { + cp.processMap(loc, stmtCtx, clauseOps); + } +} - clauseOps.mapVars.push_back(mapOp); - mapSyms.push_back(&sym); - mapLocs.push_back(baseOp.getLoc()); - mapTypes.push_back(baseOp.getType()); - } - } - }; - Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); +static void genTaskClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); + cp.processDefault(); + cp.processDepend(clauseOps); + cp.processFinal(stmtCtx, clauseOps); + cp.processIf(llvm::omp::Directive::OMPD_task, clauseOps); + cp.processMergeable(clauseOps); + cp.processPriority(stmtCtx, clauseOps); + cp.processUntied(clauseOps); + // TODO Support delayed privatization.
- auto targetOp = converter.getFirOpBuilder().create( - currentLocation, clauseOps); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_task); +} - genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms, - mapLocs, mapTypes, currentLocation); +static void genTaskgroupClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskgroupClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processAllocate(clauseOps); - return targetOp; + cp.processTODO(loc, + llvm::omp::Directive::OMPD_taskgroup); } -static mlir::omp::TeamsOp -genTeamsOp(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - mlir::Location currentLocation, - const Fortran::parser::OmpClauseList &clauseList, - bool outerCombined = false) { - Fortran::lower::StatementContext stmtCtx; - mlir::omp::TeamsClauseOps clauseOps; +static void genTaskwaitClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TaskwaitClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processTODO( + loc, llvm::omp::Directive::OMPD_taskwait); +} - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); +static void genTeamsClauses(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, + const Fortran::parser::OmpClauseList &clauses, + mlir::Location loc, + mlir::omp::TeamsClauseOps &clauseOps) { + ClauseProcessor cp(converter, semaCtx, clauses); cp.processAllocate(clauseOps); cp.processDefault(); + cp.processIf(llvm::omp::Directive::OMPD_teams, clauseOps); cp.processNumTeams(stmtCtx, clauseOps); cp.processThreadLimit(stmtCtx, clauseOps); // TODO Support delayed privatization. - cp.processTODO(currentLocation, - llvm::omp::Directive::OMPD_teams); - - return genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval) - .setGenNested(genNested) - .setOuterCombined(outerCombined) - .setClauses(&clauseList), - clauseOps); + cp.processTODO(loc, llvm::omp::Directive::OMPD_teams); } -/// Extract the list of function and variable symbols affected by the given -/// 'declare target' directive and return the intended device type for them. 
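// The payoff of the clause/op split is easiest to see on TASKWAIT, the
// smallest case: with genTaskwaitClauses above in place, the op generator
// later in this patch reduces to three lines (shape taken from genTaskwaitOp
// below):
//
//   mlir::omp::TaskwaitClauseOps clauseOps;
//   genTaskwaitClauses(converter, semaCtx, clauseList, loc, clauseOps);
//   return converter.getFirOpBuilder().create<mlir::omp::TaskwaitOp>(
//       loc, clauseOps);
//
// and the clause helper remains reusable if TASKWAIT ever needs to be
// generated as part of a larger construct.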
-static void getDeclareTargetInfo( +static void genWsloopClauses( Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::StatementContext &stmtCtx, Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - mlir::omp::DeclareTargetClauseOps &clauseOps, - llvm::SmallVectorImpl &symbolAndClause) { - const auto &spec = std::get( - declareTargetConstruct.t); - if (const auto *objectList{ - Fortran::parser::Unwrap(spec.u)}) { - ObjectList objects{makeObjects(*objectList, semaCtx)}; - // Case: declare target(func, var1, var2) - gatherFuncAndVarSyms(objects, mlir::omp::DeclareTargetCaptureClause::to, - symbolAndClause); - } else if (const auto *clauseList{ - Fortran::parser::Unwrap( - spec.u)}) { - if (clauseList->v.empty()) { - // Case: declare target, implicit capture of function - symbolAndClause.emplace_back( - mlir::omp::DeclareTargetCaptureClause::to, - eval.getOwningProcedure()->getSubprogramSymbol()); - } + const Fortran::parser::OmpClauseList &beginClauses, + const Fortran::parser::OmpClauseList *endClauses, mlir::Location loc, + mlir::omp::WsloopClauseOps &clauseOps, + llvm::SmallVectorImpl &iv, + llvm::SmallVectorImpl &reductionTypes, + llvm::SmallVectorImpl &reductionSyms) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + ClauseProcessor bcp(converter, semaCtx, beginClauses); + bcp.processCollapse(loc, eval, clauseOps, iv); + bcp.processOrdered(clauseOps); + bcp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); + bcp.processSchedule(stmtCtx, clauseOps); + clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); + // TODO Support delayed privatization. - ClauseProcessor cp(converter, semaCtx, *clauseList); - cp.processTo(symbolAndClause); - cp.processEnter(symbolAndClause); - cp.processLink(symbolAndClause); - cp.processDeviceType(clauseOps); - cp.processTODO(converter.getCurrentLocation(), - llvm::omp::Directive::OMPD_declare_target); + if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) + clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); + + if (endClauses) { + ClauseProcessor ecp(converter, semaCtx, *endClauses); + ecp.processNowait(clauseOps); } + + bcp.processTODO( + loc, llvm::omp::Directive::OMPD_do); } -static void collectDeferredDeclareTargets( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct &declareTargetConstruct, - llvm::SmallVectorImpl - &deferredDeclareTarget) { - mlir::omp::DeclareTargetClauseOps clauseOps; - llvm::SmallVector symbolAndClause; - getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, - clauseOps, symbolAndClause); - // Return the device type only if at least one of the targets for the - // directive is a function or subroutine - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); +//===----------------------------------------------------------------------===// +// Code generation functions for leaf constructs +//===----------------------------------------------------------------------===// - for (const DeclareTargetCapturePair &symClause : symbolAndClause) { - mlir::Operation *op = mod.lookupSymbol(converter.mangleName( - std::get(symClause))); +static mlir::omp::BarrierOp +genBarrierOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation 
&eval, mlir::Location loc) { + return converter.getFirOpBuilder().create(loc); +} - if (!op) { - deferredDeclareTarget.push_back({std::get<0>(symClause), - clauseOps.deviceType, - std::get<1>(symClause)}); +static mlir::omp::CriticalOp +genCriticalOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + const std::optional &name) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::FlatSymbolRefAttr nameAttr; + + if (name) { + std::string nameStr = name->ToString(); + mlir::ModuleOp mod = firOpBuilder.getModule(); + auto global = mod.lookupSymbol(nameStr); + if (!global) { + mlir::omp::CriticalClauseOps clauseOps; + genCriticalDeclareClauses(converter, semaCtx, clauseList, loc, clauseOps, + nameStr); + + mlir::OpBuilder modBuilder(mod.getBodyRegion()); + global = modBuilder.create(loc, clauseOps); } + nameAttr = mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(), + global.getSymName()); } + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_critical) + .setGenNested(genNested), + nameAttr); } -static std::optional -getDeclareTargetFunctionDevice( - Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPDeclareTargetConstruct - &declareTargetConstruct) { - mlir::omp::DeclareTargetClauseOps clauseOps; - llvm::SmallVector symbolAndClause; - getDeclareTargetInfo(converter, semaCtx, eval, declareTargetConstruct, - clauseOps, symbolAndClause); +static mlir::omp::DistributeOp +genDistributeOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "Distribute construct"); + return nullptr; +} - // Return the device type only if at least one of the targets for the - // directive is a function or subroutine - mlir::ModuleOp mod = converter.getFirOpBuilder().getModule(); - for (const DeclareTargetCapturePair &symClause : symbolAndClause) { - mlir::Operation *op = mod.lookupSymbol(converter.mangleName( - std::get(symClause))); +static mlir::omp::FlushOp +genFlushOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const std::optional &objectList, + const std::optional> + &clauseList) { + llvm::SmallVector operandRange; + genFlushClauses(converter, semaCtx, objectList, clauseList, loc, + operandRange); + + return converter.getFirOpBuilder().create( + converter.getCurrentLocation(), operandRange); +} - if (mlir::isa_and_nonnull(op)) - return clauseOps.deviceType; - } +static mlir::omp::MasterOp +genMasterOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc) { + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_master) + .setGenNested(genNested)); +} - return std::nullopt; +static mlir::omp::OrderedOp +genOrderedOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const 
Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "OMPD_ordered"); + return nullptr; } -//===----------------------------------------------------------------------===// -// genOMP() Code generation helper functions -//===----------------------------------------------------------------------===// +static mlir::omp::OrderedRegionOp +genOrderedRegionOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::OrderedRegionClauseOps clauseOps; + genOrderedRegionClauses(converter, semaCtx, clauseList, loc, clauseOps); -static void -genOmpSimpleStandalone(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, bool genNested, - const Fortran::parser::OpenMPSimpleStandaloneConstruct - &simpleStandaloneConstruct) { - const auto &directive = - std::get( - simpleStandaloneConstruct.t); + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_ordered) + .setGenNested(genNested), + clauseOps); +} + +static mlir::omp::ParallelOp +genParallelOp(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - const auto &opClauseList = - std::get(simpleStandaloneConstruct.t); - mlir::Location currentLocation = converter.genLocation(directive.source); + Fortran::lower::StatementContext stmtCtx; + mlir::omp::ParallelClauseOps clauseOps; + llvm::SmallVector privateSyms; + llvm::SmallVector reductionTypes; + llvm::SmallVector reductionSyms; + genParallelClauses(converter, semaCtx, stmtCtx, clauseList, loc, + /*processReduction=*/!outerCombined, clauseOps, + reductionTypes, reductionSyms); - switch (directive.v) { - default: - break; - case llvm::omp::Directive::OMPD_barrier: - firOpBuilder.create(currentLocation); - break; - case llvm::omp::Directive::OMPD_taskwait: { - mlir::omp::TaskwaitClauseOps clauseOps; - ClauseProcessor cp(converter, semaCtx, opClauseList); - cp.processTODO( - currentLocation, llvm::omp::Directive::OMPD_taskwait); - firOpBuilder.create(currentLocation, clauseOps); - break; - } - case llvm::omp::Directive::OMPD_taskyield: - firOpBuilder.create(currentLocation); - break; - case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, semaCtx, eval, genNested, currentLocation, - opClauseList); - break; - case llvm::omp::Directive::OMPD_target_enter_data: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_target_exit_data: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_target_update: - genTargetEnterExitDataUpdateOp( - converter, semaCtx, currentLocation, opClauseList); - break; - case llvm::omp::Directive::OMPD_ordered: - TODO(currentLocation, "OMPD_ordered"); - } + auto reductionCallback = [&](mlir::Operation *op) { + genReductionVars(op, converter, loc, reductionSyms, reductionTypes); + return reductionSyms; + }; + + OpWithBodyGenInfo genInfo = + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + 
llvm::omp::Directive::OMPD_parallel) + .setGenNested(genNested) + .setOuterCombined(outerCombined) + .setClauses(&clauseList) + .setReductions(&reductionSyms, &reductionTypes) + .setGenRegionEntryCb(reductionCallback); + + if (!enableDelayedPrivatization) + return genOpWithBody(genInfo, clauseOps); + + bool privatize = !outerCombined; + DataSharingProcessor dsp(converter, semaCtx, clauseList, eval, + /*useDelayedPrivatization=*/true, &symTable); + + if (privatize) + dsp.processStep1(&clauseOps, &privateSyms); + + auto genRegionEntryCB = [&](mlir::Operation *op) { + auto parallelOp = llvm::cast(op); + + llvm::SmallVector reductionLocs( + clauseOps.reductionVars.size(), loc); + + mlir::OperandRange privateVars = parallelOp.getPrivateVars(); + mlir::Region ®ion = parallelOp.getRegion(); + + llvm::SmallVector privateVarTypes = reductionTypes; + privateVarTypes.reserve(privateVarTypes.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarTypes), + [](mlir::Value v) { return v.getType(); }); + + llvm::SmallVector privateVarLocs = reductionLocs; + privateVarLocs.reserve(privateVarLocs.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(privateVarLocs), + [](mlir::Value v) { return v.getLoc(); }); + + firOpBuilder.createBlock(®ion, /*insertPt=*/{}, privateVarTypes, + privateVarLocs); + + llvm::SmallVector allSymbols = + reductionSyms; + allSymbols.append(privateSyms); + for (auto [arg, prv] : llvm::zip_equal(allSymbols, region.getArguments())) { + converter.bindSymbol(*arg, prv); + } + + return allSymbols; + }; + + // TODO Merge with the reduction CB. + genInfo.setGenRegionEntryCb(genRegionEntryCB).setDataSharingProcessor(&dsp); + return genOpWithBody(genInfo, clauseOps); } -static void -genOmpFlush(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { - llvm::SmallVector operandRange; - if (const auto &ompObjectList = - std::get>( - flushConstruct.t)) - genObjectList2(*ompObjectList, converter, operandRange); - const auto &memOrderClause = - std::get>>( - flushConstruct.t); - if (memOrderClause && memOrderClause->size() > 0) - TODO(converter.getCurrentLocation(), "Handle OmpMemoryOrderClause"); - converter.getFirOpBuilder().create( - converter.getCurrentLocation(), operandRange); +static mlir::omp::SectionOp +genSectionOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + // Currently only private/firstprivate clause is handled, and + // all privatization is done within `omp.section` operations. 
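// genSectionOp below, like most leaf-construct generators in this file, goes
// through the OpWithBodyGenInfo fluent builder: each setter returns the info
// object itself, so call sites chain only the optional pieces they need.
// Spelled out in full, a call site looks like:
//
//   return genOpWithBody<mlir::omp::SectionOp>(
//       OpWithBodyGenInfo(converter, semaCtx, loc, eval,
//                         llvm::omp::Directive::OMPD_section)
//           .setGenNested(genNested)   // also lower nested evaluations
//           .setClauses(&clauseList)); // clauses visible to body codegen
//
// The template argument selects the op type genOpWithBody creates before it
// populates the op's region.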
+ return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_section) + .setGenNested(genNested) + .setClauses(&clauseList)); } -static llvm::SmallVector -genLoopVars(mlir::Operation *op, Fortran::lower::AbstractConverter &converter, - mlir::Location &loc, - llvm::ArrayRef args) { +static mlir::omp::SectionsOp +genSectionsOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const mlir::omp::SectionsClauseOps &clauseOps) { + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_sections) + .setGenNested(false), + clauseOps); +} + +static mlir::omp::SimdOp +genSimdOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - auto ®ion = op->getRegion(0); + DataSharingProcessor dsp(converter, semaCtx, clauseList, eval); + dsp.processStep1(); - std::size_t loopVarTypeSize = 0; - for (const Fortran::semantics::Symbol *arg : args) - loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); - mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - llvm::SmallVector tiv(args.size(), loopVarType); - llvm::SmallVector locs(args.size(), loc); - firOpBuilder.createBlock(®ion, {}, tiv, locs); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind that location to the argument. - mlir::Operation *storeOp = nullptr; - for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { - mlir::Value indexVal = fir::getBase(region.front().getArgument(argIndex)); - storeOp = - createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); - } - firOpBuilder.setInsertionPointAfter(storeOp); + Fortran::lower::StatementContext stmtCtx; + mlir::omp::LoopNestClauseOps loopClauseOps; + mlir::omp::SimdClauseOps simdClauseOps; + llvm::SmallVector iv; + genLoopNestClauses(converter, semaCtx, eval, clauseList, loc, loopClauseOps, + iv); + genSimdClauses(converter, semaCtx, clauseList, loc, simdClauseOps); - return llvm::SmallVector(args); + // Create omp.simd wrapper. + auto simdOp = firOpBuilder.create(loc, simdClauseOps); + + // TODO: Add reduction-related arguments to the wrapper's entry block. + firOpBuilder.createBlock(&simdOp.getRegion()); + firOpBuilder.setInsertionPoint( + Fortran::lower::genOpenMPTerminator(firOpBuilder, simdOp, loc)); + + // Create nested omp.loop_nest and fill body with loop contents. 
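// The wrapper-plus-loop split that the two comments above describe produces
// IR of roughly this shape (sketch only; operand syntax abbreviated):
//
//   omp.simd {
//     omp.loop_nest (%iv) : i32 = (%lb) to (%ub) inclusive step (%step) {
//       ... loop body ...
//       omp.yield
//     }
//     omp.terminator
//   }
//
// genOpenMPTerminator at the bottom of this file encodes the same split:
// omp.loop_nest regions end in omp.yield, while the wrapper's own region
// ends in omp.terminator.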
+ auto loopOp = firOpBuilder.create(loc, loopClauseOps); + + auto *nestedEval = + getCollapsedLoopEval(eval, Fortran::lower::getCollapseValue(clauseList)); + + auto ivCallback = [&](mlir::Operation *op) { + return genLoopVars(op, converter, loc, iv); + }; + + createBodyOfOp(*loopOp, + OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval, + llvm::omp::Directive::OMPD_simd) + .setClauses(&clauseList) + .setDataSharingProcessor(&dsp) + .setGenRegionEntryCb(ivCallback)); + + return simdOp; } -static llvm::SmallVector -genLoopAndReductionVars( - mlir::Operation *op, Fortran::lower::AbstractConverter &converter, - mlir::Location &loc, - llvm::ArrayRef loopArgs, - llvm::ArrayRef reductionArgs, - llvm::ArrayRef reductionTypes) { +static mlir::omp::SingleOp +genSingleOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList &endClauseList) { + mlir::omp::SingleClauseOps clauseOps; + genSingleClauses(converter, semaCtx, beginClauseList, endClauseList, loc, + clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_single) + .setGenNested(genNested) + .setClauses(&beginClauseList), + clauseOps); +} + +static mlir::omp::TargetOp +genTargetOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + Fortran::lower::StatementContext stmtCtx; - llvm::SmallVector blockArgTypes; - llvm::SmallVector blockArgLocs; - blockArgTypes.reserve(loopArgs.size() + reductionArgs.size()); - blockArgLocs.reserve(blockArgTypes.size()); - mlir::Block *entryBlock; + bool processHostOnlyClauses = + !llvm::cast(*converter.getModuleOp()) + .getIsTargetDevice(); - if (loopArgs.size()) { - std::size_t loopVarTypeSize = 0; - for (const Fortran::semantics::Symbol *arg : loopArgs) - loopVarTypeSize = std::max(loopVarTypeSize, arg->GetUltimate().size()); - mlir::Type loopVarType = getLoopVarType(converter, loopVarTypeSize); - std::fill_n(std::back_inserter(blockArgTypes), loopArgs.size(), - loopVarType); - std::fill_n(std::back_inserter(blockArgLocs), loopArgs.size(), loc); - } - if (reductionArgs.size()) { - llvm::copy(reductionTypes, std::back_inserter(blockArgTypes)); - std::fill_n(std::back_inserter(blockArgLocs), reductionArgs.size(), loc); - } - entryBlock = firOpBuilder.createBlock(&op->getRegion(0), {}, blockArgTypes, - blockArgLocs); - // The argument is not currently in memory, so make a temporary for the - // argument, and store it there, then bind that location to the argument. 
- if (loopArgs.size()) { - mlir::Operation *storeOp = nullptr; - for (auto [argIndex, argSymbol] : llvm::enumerate(loopArgs)) { - mlir::Value indexVal = - fir::getBase(op->getRegion(0).front().getArgument(argIndex)); - storeOp = - createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); + mlir::omp::TargetClauseOps clauseOps; + llvm::SmallVector mapSyms, devicePtrSyms, + deviceAddrSyms; + llvm::SmallVector mapLocs, devicePtrLocs, deviceAddrLocs; + llvm::SmallVector mapTypes, devicePtrTypes, deviceAddrTypes; + genTargetClauses(converter, semaCtx, stmtCtx, clauseList, loc, + processHostOnlyClauses, /*processReduction=*/outerCombined, + clauseOps, mapSyms, mapLocs, mapTypes, deviceAddrSyms, + deviceAddrLocs, deviceAddrTypes, devicePtrSyms, + devicePtrLocs, devicePtrTypes); + + // 5.8.1 Implicit Data-Mapping Attribute Rules + // The following code follows the implicit data-mapping rules to map all the + // symbols used inside the region that have not been explicitly mapped using + // the map clause. + auto captureImplicitMap = [&](const Fortran::semantics::Symbol &sym) { + if (llvm::find(mapSyms, &sym) == mapSyms.end()) { + mlir::Value baseOp = converter.getSymbolAddress(sym); + if (!baseOp) + if (const auto *details = sym.template detailsIf< + Fortran::semantics::HostAssocDetails>()) { + baseOp = converter.getSymbolAddress(details->symbol()); + converter.copySymbolBinding(details->symbol(), sym); + } + + if (baseOp) { + llvm::SmallVector bounds; + std::stringstream name; + fir::ExtendedValue dataExv = converter.getSymbolExtendedValue(sym); + name << sym.name().ToString(); + + Fortran::lower::AddrAndBoundsInfo info = getDataOperandBaseAddr( + converter, firOpBuilder, sym, converter.getCurrentLocation()); + if (fir::unwrapRefType(info.addr.getType()).isa()) + bounds = + Fortran::lower::genBoundsOpsFromBox( + firOpBuilder, converter.getCurrentLocation(), converter, + dataExv, info); + if (fir::unwrapRefType(info.addr.getType()).isa()) { + bool dataExvIsAssumedSize = + Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate()); + bounds = Fortran::lower::genBaseBoundsOps( + firOpBuilder, converter.getCurrentLocation(), converter, dataExv, + dataExvIsAssumedSize); + } + + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + + mlir::Type eleType = baseOp.getType(); + if (auto refType = baseOp.getType().dyn_cast()) + eleType = refType.getElementType(); + + // If a variable is specified in declare target link and if device + // type is not specified as `nohost`, it needs to be mapped tofrom + mlir::ModuleOp mod = firOpBuilder.getModule(); + mlir::Operation *op = mod.lookupSymbol(converter.mangleName(sym)); + auto declareTargetOp = + llvm::dyn_cast_if_present(op); + if (declareTargetOp && declareTargetOp.isDeclareTarget()) { + if (declareTargetOp.getDeclareTargetCaptureClause() == + mlir::omp::DeclareTargetCaptureClause::link && + declareTargetOp.getDeclareTargetDeviceType() != + mlir::omp::DeclareTargetDeviceType::nohost) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + } + } else if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if (!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= 
llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; + } + + mlir::Value mapOp = createMapInfoOp( + firOpBuilder, baseOp.getLoc(), baseOp, mlir::Value{}, name.str(), + bounds, {}, + static_cast< + std::underlying_type_t>( + mapFlag), + captureKind, baseOp.getType()); + + clauseOps.mapVars.push_back(mapOp); + mapSyms.push_back(&sym); + mapLocs.push_back(baseOp.getLoc()); + mapTypes.push_back(baseOp.getType()); + } } - firOpBuilder.setInsertionPointAfter(storeOp); - } - // Bind the reduction arguments to their block arguments - for (auto [arg, prv] : llvm::zip_equal( - reductionArgs, - llvm::drop_begin(entryBlock->getArguments(), loopArgs.size()))) { - converter.bindSymbol(*arg, prv); - } + }; + Fortran::lower::pft::visitAllSymbols(eval, captureImplicitMap); - return llvm::SmallVector(loopArgs); + auto targetOp = firOpBuilder.create(loc, clauseOps); + genBodyOfTargetOp(converter, semaCtx, eval, genNested, targetOp, mapSyms, + mapLocs, mapTypes, loc); + return targetOp; } -static void -createSimdLoop(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - llvm::omp::Directive ompDirective, - const Fortran::parser::OmpClauseList &loopOpClauseList, - mlir::Location loc) { +static mlir::omp::TargetDataOp +genTargetDataOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + Fortran::lower::StatementContext stmtCtx; + mlir::omp::TargetDataClauseOps clauseOps; + llvm::SmallVector useDeviceTypes; + llvm::SmallVector useDeviceLocs; + llvm::SmallVector useDeviceSyms; + genTargetDataClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps, + useDeviceTypes, useDeviceLocs, useDeviceSyms); + + auto targetDataOp = + converter.getFirOpBuilder().create(loc, + clauseOps); + genBodyOfTargetDataOp(converter, semaCtx, eval, genNested, targetDataOp, + useDeviceTypes, useDeviceLocs, useDeviceSyms, loc); + return targetDataOp; +} + +template +static OpTy genTargetEnterExitUpdateDataOp( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - DataSharingProcessor dsp(converter, semaCtx, loopOpClauseList, eval); - dsp.processStep1(); + Fortran::lower::StatementContext stmtCtx; + // GCC 9.3.0 emits a (probably) bogus warning about an unused variable. 
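// Recapping the decisions captureImplicitMap (in genTargetOp above) makes
// for a symbol that is used in the target region but not explicitly mapped:
//
//   already in mapSyms                 -> nothing: explicitly mapped
//   declare target link, not nohost    -> implicit map, to|from, by-ref
//   trivial scalar or character        -> implicit map, captured by-copy
//   builtin C_PTR                      -> implicit map only, by-ref
//   anything else (arrays, derived)    -> implicit map, to|from, by-ref
//
// Every entry carries OMP_MAP_IMPLICIT, which lets the runtime tell these
// apart from user-written map clauses.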
+ [[maybe_unused]] llvm::omp::Directive directive; + if constexpr (std::is_same_v<OpTy, mlir::omp::TargetEnterDataOp>) { + directive = llvm::omp::Directive::OMPD_target_enter_data; + } else if constexpr (std::is_same_v<OpTy, mlir::omp::TargetExitDataOp>) { + directive = llvm::omp::Directive::OMPD_target_exit_data; + } else if constexpr (std::is_same_v<OpTy, mlir::omp::TargetUpdateOp>) { + directive = llvm::omp::Directive::OMPD_target_update; + } else { + llvm_unreachable("Unexpected TARGET DATA construct"); + } + mlir::omp::TargetEnterExitUpdateDataClauseOps clauseOps; + genTargetEnterExitUpdateDataClauses(converter, semaCtx, stmtCtx, clauseList, + loc, directive, clauseOps); + + return firOpBuilder.create<OpTy>(loc, clauseOps); +} + +static mlir::omp::TaskOp +genTaskOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { Fortran::lower::StatementContext stmtCtx; - mlir::omp::SimdLoopClauseOps clauseOps; - llvm::SmallVector<const Fortran::semantics::Symbol *> iv; + mlir::omp::TaskClauseOps clauseOps; + genTaskClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); - ClauseProcessor cp(converter, semaCtx, loopOpClauseList); - cp.processCollapse(loc, eval, clauseOps, iv); - cp.processReduction(loc, clauseOps); - cp.processIf(llvm::omp::Directive::OMPD_simd, clauseOps); - cp.processSimdlen(clauseOps); - cp.processSafelen(clauseOps); - clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); - // TODO Support delayed privatization. + return genOpWithBody<mlir::omp::TaskOp>( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_task) + .setGenNested(genNested) + .setClauses(&clauseList), + clauseOps); +} + +static mlir::omp::TaskgroupOp +genTaskgroupOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::TaskgroupClauseOps clauseOps; + genTaskgroupClauses(converter, semaCtx, clauseList, loc, clauseOps); + + return genOpWithBody<mlir::omp::TaskgroupOp>( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_taskgroup) + .setGenNested(genNested) + .setClauses(&clauseList), + clauseOps); +} - cp.processTODO(loc, ompDirective); +static mlir::omp::TaskloopOp +genTaskloopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + TODO(loc, "Taskloop construct"); +} - auto *nestedEval = getCollapsedLoopEval( - eval, Fortran::lower::getCollapseValue(loopOpClauseList)); +static mlir::omp::TaskwaitOp +genTaskwaitOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &clauseList) { + mlir::omp::TaskwaitClauseOps clauseOps; + genTaskwaitClauses(converter, semaCtx, clauseList, loc, clauseOps); + return converter.getFirOpBuilder().create<mlir::omp::TaskwaitOp>(loc, + clauseOps); +} - auto ivCallback = [&](mlir::Operation *op) { - return genLoopVars(op, converter, loc, iv); - }; +static mlir::omp::TaskyieldOp +genTaskyieldOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc) { + return converter.getFirOpBuilder().create<mlir::omp::TaskyieldOp>(loc); +} - genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) - 
.setClauses(&loopOpClauseList) - .setDataSharingProcessor(&dsp) - .setGenRegionEntryCb(ivCallback), +static mlir::omp::TeamsOp +genTeamsOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, bool genNested, + mlir::Location loc, const Fortran::parser::OmpClauseList &clauseList, + bool outerCombined = false) { + Fortran::lower::StatementContext stmtCtx; + mlir::omp::TeamsClauseOps clauseOps; + genTeamsClauses(converter, semaCtx, stmtCtx, clauseList, loc, clauseOps); + + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, eval, + llvm::omp::Directive::OMPD_teams) + .setGenNested(genNested) + .setOuterCombined(outerCombined) + .setClauses(&clauseList), clauseOps); } -static void createWsloop(Fortran::lower::AbstractConverter &converter, - Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, - llvm::omp::Directive ompDirective, - const Fortran::parser::OmpClauseList &beginClauseList, - const Fortran::parser::OmpClauseList *endClauseList, - mlir::Location loc) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); +static mlir::omp::WsloopOp +genWsloopOp(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, mlir::Location loc, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList) { DataSharingProcessor dsp(converter, semaCtx, beginClauseList, eval); dsp.processStep1(); @@ -1683,30 +1852,9 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; llvm::SmallVector reductionTypes; llvm::SmallVector reductionSyms; - - ClauseProcessor cp(converter, semaCtx, beginClauseList); - cp.processCollapse(loc, eval, clauseOps, iv); - cp.processSchedule(stmtCtx, clauseOps); - cp.processReduction(loc, clauseOps, &reductionTypes, &reductionSyms); - cp.processOrdered(clauseOps); - clauseOps.loopInclusiveAttr = firOpBuilder.getUnitAttr(); - // TODO Support delayed privatization. - - if (ReductionProcessor::doReductionByRef(clauseOps.reductionVars)) - clauseOps.reductionByRefAttr = firOpBuilder.getUnitAttr(); - - cp.processTODO(loc, - ompDirective); - - // In FORTRAN `nowait` clause occur at the end of `omp do` directive. 
- // i.e - // !$omp do - // <...> - // !$omp end do nowait - if (endClauseList) { - ClauseProcessor ecp(converter, semaCtx, *endClauseList); - ecp.processNowait(clauseOps); - } + genWsloopClauses(converter, semaCtx, stmtCtx, eval, beginClauseList, + endClauseList, loc, clauseOps, iv, reductionTypes, + reductionSyms); auto *nestedEval = getCollapsedLoopEval( eval, Fortran::lower::getCollapseValue(beginClauseList)); @@ -1716,8 +1864,9 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, reductionTypes); }; - genOpWithBody( - OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval) + return genOpWithBody( + OpWithBodyGenInfo(converter, semaCtx, loc, *nestedEval, + llvm::omp::Directive::OMPD_do) .setClauses(&beginClauseList) .setDataSharingProcessor(&dsp) .setReductions(&reductionSyms, &reductionTypes) @@ -1725,16 +1874,48 @@ static void createWsloop(Fortran::lower::AbstractConverter &converter, clauseOps); } -static void createSimdWsloop( +//===----------------------------------------------------------------------===// +// Code generation functions for composite constructs +//===----------------------------------------------------------------------===// + +static void genCompositeDistributeParallelDo( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { + TODO(loc, "Composite DISTRIBUTE PARALLEL DO"); +} + +static void genCompositeDistributeParallelDoSimd( + Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { + TODO(loc, "Composite DISTRIBUTE PARALLEL DO SIMD"); +} + +static void genCompositeDistributeSimd( Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, - Fortran::lower::pft::Evaluation &eval, llvm::omp::Directive ompDirective, + Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OmpClauseList &beginClauseList, const Fortran::parser::OmpClauseList *endClauseList, mlir::Location loc) { + TODO(loc, "Composite DISTRIBUTE SIMD"); +} + +static void +genCompositeDoSimd(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, + mlir::Location loc) { ClauseProcessor cp(converter, semaCtx, beginClauseList); cp.processTODO(loc, - ompDirective); + clause::Order, clause::Safelen, clause::Simdlen>( + loc, llvm::omp::OMPD_do_simd); // TODO: Add support for vectorization - add vectorization hints inside loop // body. // OpenMP standard does not specify the length of vector instructions. @@ -1743,34 +1924,17 @@ static void createSimdWsloop( // When support for vectorization is enabled, then we need to add handling of // if clause. Currently if clause can be skipped because we always assume // SIMD length = 1. 
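// Net effect, for now: a DO SIMD loop lowers exactly like a plain DO
// worksharing loop, and the SIMD-only clauses routed into processTODO<>
// above (ORDER, SAFELEN and SIMDLEN among them) still produce a "not yet
// implemented" error instead of being silently dropped. For illustration:
//
//   !$omp do simd            ! lowers to omp.wsloop today
//   !$omp do simd simdlen(8) ! hits the TODO until SIMDLEN is supported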
- createWsloop(converter, semaCtx, eval, ompDirective, beginClauseList, - endClauseList, loc); + genWsloopOp(converter, semaCtx, eval, loc, beginClauseList, endClauseList); } static void -markDeclareTarget(mlir::Operation *op, - Fortran::lower::AbstractConverter &converter, - mlir::omp::DeclareTargetCaptureClause captureClause, - mlir::omp::DeclareTargetDeviceType deviceType) { - // TODO: Add support for program local variables with declare target applied - auto declareTargetOp = llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(op); - if (!declareTargetOp) - fir::emitFatalError( - converter.getCurrentLocation(), - "Attempt to apply declare target on unsupported operation"); - - // The function or global already has a declare target applied to it, very - // likely through implicit capture (usage in another declare target - // function/subroutine). It should be marked as any if it has been assigned - // both host and nohost, else we skip, as there is no change - if (declareTargetOp.isDeclareTarget()) { - if (declareTargetOp.getDeclareTargetDeviceType() != deviceType) - declareTargetOp.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any, - captureClause); - return; - } - - declareTargetOp.setDeclareTarget(deviceType, captureClause); +genCompositeTaskloopSimd(Fortran::lower::AbstractConverter &converter, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OmpClauseList &beginClauseList, + const Fortran::parser::OmpClauseList *endClauseList, + mlir::Location loc) { + TODO(loc, "Composite TASKLOOP SIMD"); } //===----------------------------------------------------------------------===// @@ -1865,6 +2029,102 @@ genOMP(Fortran::lower::AbstractConverter &converter, ompDeclConstruct.u); } +//===----------------------------------------------------------------------===// +// OpenMPStandaloneConstruct visitors +//===----------------------------------------------------------------------===// + +static void genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPSimpleStandaloneConstruct + &simpleStandaloneConstruct) { + const auto &directive = + std::get<Fortran::parser::OmpSimpleStandaloneDirective>( + simpleStandaloneConstruct.t); + const auto &clauseList = + std::get<Fortran::parser::OmpClauseList>(simpleStandaloneConstruct.t); + mlir::Location currentLocation = converter.genLocation(directive.source); + + switch (directive.v) { + default: + break; + case llvm::omp::Directive::OMPD_barrier: + genBarrierOp(converter, semaCtx, eval, currentLocation); + break; + case llvm::omp::Directive::OMPD_taskwait: + genTaskwaitOp(converter, semaCtx, eval, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_taskyield: + genTaskyieldOp(converter, semaCtx, eval, currentLocation); + break; + case llvm::omp::Directive::OMPD_target_data: + genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true, + currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_enter_data: + genTargetEnterExitUpdateDataOp<mlir::omp::TargetEnterDataOp>( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_exit_data: + genTargetEnterExitUpdateDataOp<mlir::omp::TargetExitDataOp>( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_target_update: + genTargetEnterExitUpdateDataOp<mlir::omp::TargetUpdateOp>( + converter, semaCtx, currentLocation, clauseList); + break; + case llvm::omp::Directive::OMPD_ordered: + genOrderedOp(converter, semaCtx, eval, currentLocation, clauseList); + break;
+ } +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPFlushConstruct &flushConstruct) { + const auto &verbatim = std::get<Fortran::parser::Verbatim>(flushConstruct.t); + const auto &objectList = + std::get<std::optional<Fortran::parser::OmpObjectList>>(flushConstruct.t); + const auto &clauseList = + std::get<std::optional<std::list<Fortran::parser::OmpMemoryOrderClause>>>( + flushConstruct.t); + mlir::Location currentLocation = converter.genLocation(verbatim.source); + genFlushOp(converter, semaCtx, eval, currentLocation, objectList, clauseList); +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) { + TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct"); +} + +static void genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPCancellationPointConstruct + &cancellationPointConstruct) { + TODO(converter.getCurrentLocation(), "OpenMPCancellationPointConstruct"); +} + +static void +genOMP(Fortran::lower::AbstractConverter &converter, + Fortran::lower::SymMap &symTable, + Fortran::semantics::SemanticsContext &semaCtx, + Fortran::lower::pft::Evaluation &eval, + const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) { + std::visit( + [&](auto &&s) { return genOMP(converter, symTable, semaCtx, eval, s); }, + standaloneConstruct.u); +} + //===----------------------------------------------------------------------===// // OpenMPConstruct visitors //===----------------------------------------------------------------------===// @@ -1934,13 +2194,18 @@ genOMP(Fortran::lower::AbstractConverter &converter, std::get<Fortran::parser::OmpBeginBlockDirective>(blockConstruct.t); const auto &endBlockDirective = std::get<Fortran::parser::OmpEndBlockDirective>(blockConstruct.t); - const auto &directive = - std::get<Fortran::parser::OmpBlockDirective>(beginBlockDirective.t); + mlir::Location currentLocation = + converter.genLocation(beginBlockDirective.source); + const auto origDirective = + std::get<Fortran::parser::OmpBlockDirective>(beginBlockDirective.t).v; const auto &beginClauseList = std::get<Fortran::parser::OmpClauseList>(beginBlockDirective.t); const auto &endClauseList = std::get<Fortran::parser::OmpClauseList>(endBlockDirective.t); + assert(llvm::omp::blockConstructSet.test(origDirective) && + "Expected block construct"); + for (const Fortran::parser::OmpClause &clause : beginClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); if (!std::get_if(&clause.u) &&
genTargetOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, directive.v); - break; - case llvm::omp::Directive::OMPD_target_data: - genTargetDataOp(converter, semaCtx, eval, /*genNested=*/true, - currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_task: - genTaskOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList); - break; - case llvm::omp::Directive::OMPD_taskgroup: - genTaskgroupOp(converter, semaCtx, eval, /*genNested=*/true, - currentLocation, beginClauseList); - break; - case llvm::omp::Directive::OMPD_teams: - genTeamsOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, - /*outerCombined=*/false); - break; - case llvm::omp::Directive::OMPD_workshare: - // FIXME: Workshare is not a commonly used OpenMP construct, an - // implementation for this feature will come later. For the codes - // that use this construct, add a single construct for now. - genSingleOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, - beginClauseList, endClauseList); - break; - default: - singleDirective = false; - break; - } - - if (singleDirective) - return; - - // Codegen for combined directives - bool combinedDirective = false; - if ((llvm::omp::allTargetSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - genTargetOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList, directive.v, - /*outerCombined=*/true); - combinedDirective = true; - } - if ((llvm::omp::allTeamsSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList); - combinedDirective = true; - } - if ((llvm::omp::allParallelSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - bool outerCombined = - directive.v != llvm::omp::Directive::OMPD_target_parallel; - genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false, - currentLocation, beginClauseList, outerCombined); - combinedDirective = true; - } - if ((llvm::omp::workShareSet & llvm::omp::blockConstructSet) - .test(directive.v)) { - genSingleOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - beginClauseList, endClauseList); - combinedDirective = true; + std::optional nextDir = origDirective; + bool outermostLeafConstruct = true; + while (nextDir) { + llvm::omp::Directive leafDir; + std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir); + const bool genNested = !nextDir; + const bool outerCombined = outermostLeafConstruct && nextDir.has_value(); + switch (leafDir) { + case llvm::omp::Directive::OMPD_master: + // 2.16 MASTER construct. + genMasterOp(converter, semaCtx, eval, genNested, currentLocation); + break; + case llvm::omp::Directive::OMPD_ordered: + // 2.17.9 ORDERED construct. + genOrderedRegionOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_parallel: + // 2.6 PARALLEL construct. + genParallelOp(converter, symTable, semaCtx, eval, genNested, + currentLocation, beginClauseList, outerCombined); + break; + case llvm::omp::Directive::OMPD_single: + // 2.8.2 SINGLE construct. + genSingleOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, endClauseList); + break; + case llvm::omp::Directive::OMPD_target: + // 2.12.5 TARGET construct. 
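// When TARGET is the first leaf of a combined construct (TARGET TEAMS,
// TARGET PARALLEL, ...), the loop above reaches this case with
// outerCombined=true and genNested=false. splitCombinedDirective, defined
// elsewhere in this patch, peels one leaf per iteration; sketched behaviour,
// for illustration:
//
//   splitCombinedDirective(OMPD_target_teams) -> {OMPD_target, OMPD_teams}
//   splitCombinedDirective(OMPD_teams)        -> {OMPD_teams, std::nullopt}
//
// so "!$omp target teams" emits omp.target on the first pass and the nested
// omp.teams, together with the body, on the second and final pass.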
+ genTargetOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, outerCombined); + break; + case llvm::omp::Directive::OMPD_target_data: + // 2.12.2 TARGET DATA construct. + genTargetDataOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_task: + // 2.10.1 TASK construct. + genTaskOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_taskgroup: + // 2.17.6 TASKGROUP construct. + genTaskgroupOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_teams: + // 2.7 TEAMS construct. + // FIXME Pass the outerCombined argument or rename it to better describe + // what it represents if it must always be `false` in this context. + genTeamsOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_workshare: + // 2.8.3 WORKSHARE construct. + // FIXME: Workshare is not a commonly used OpenMP construct, an + // implementation for this feature will come later. For the codes + // that use this construct, add a single construct for now. + genSingleOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList, endClauseList); + break; + default: + llvm_unreachable("Unexpected block construct"); + break; + } + outermostLeafConstruct = false; } - if (!combinedDirective) - TODO(currentLocation, "Unhandled block directive (" + - llvm::omp::getOpenMPDirectiveName(directive.v) + - ")"); - - genNestedEvaluations(converter, eval); } static void @@ -2073,44 +2317,13 @@ genOMP(Fortran::lower::AbstractConverter &converter, Fortran::semantics::SemanticsContext &semaCtx, Fortran::lower::pft::Evaluation &eval, const Fortran::parser::OpenMPCriticalConstruct &criticalConstruct) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Location currentLocation = converter.getCurrentLocation(); - std::string name; - const Fortran::parser::OmpCriticalDirective &cd = + const auto &cd = std::get(criticalConstruct.t); - if (std::get>(cd.t).has_value()) { - name = - std::get>(cd.t).value().ToString(); - } - - mlir::omp::CriticalOp criticalOp = [&]() { - if (name.empty()) { - return firOpBuilder.create( - currentLocation, mlir::FlatSymbolRefAttr()); - } - - mlir::ModuleOp module = firOpBuilder.getModule(); - mlir::OpBuilder modBuilder(module.getBodyRegion()); - auto global = module.lookupSymbol(name); - if (!global) { - mlir::omp::CriticalClauseOps clauseOps; - const auto &clauseList = std::get(cd.t); - - ClauseProcessor cp(converter, semaCtx, clauseList); - cp.processHint(clauseOps); - clauseOps.nameAttr = - mlir::StringAttr::get(firOpBuilder.getContext(), name); - - global = modBuilder.create(currentLocation, - clauseOps); - } - - return firOpBuilder.create( - currentLocation, mlir::FlatSymbolRefAttr::get(firOpBuilder.getContext(), - global.getSymName())); - }(); - auto genInfo = OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval); - createBodyOfOp(criticalOp, genInfo); + const auto &clauseList = std::get(cd.t); + const auto &name = std::get>(cd.t); + mlir::Location currentLocation = converter.getCurrentLocation(); + genCriticalOp(converter, semaCtx, eval, /*genNested=*/true, currentLocation, + clauseList, name); } static void @@ -2129,13 +2342,16 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, const Fortran::parser::OpenMPLoopConstruct &loopConstruct) { const auto &beginLoopDirective = 
std::get(loopConstruct.t); - const auto &loopOpClauseList = + const auto &beginClauseList = std::get(beginLoopDirective.t); mlir::Location currentLocation = converter.genLocation(beginLoopDirective.source); - const auto ompDirective = + const auto origDirective = std::get(beginLoopDirective.t).v; + assert(llvm::omp::loopConstructSet.test(origDirective) && + "Expected loop construct"); + const auto *endClauseList = [&]() { using RetTy = const Fortran::parser::OmpClauseList *; if (auto &endLoopDirective = @@ -2147,61 +2363,103 @@ static void genOMP(Fortran::lower::AbstractConverter &converter, return RetTy(); }(); - bool validDirective = false; - if (llvm::omp::topTaskloopSet.test(ompDirective)) { - validDirective = true; - TODO(currentLocation, "Taskloop construct"); - } else { - // Create omp.{target, teams, distribute, parallel} nested operations - if ((llvm::omp::allTargetSet & llvm::omp::loopConstructSet) - .test(ompDirective)) { - validDirective = true; - genTargetOp(converter, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, ompDirective, - /*outerCombined=*/true); - } - if ((llvm::omp::allTeamsSet & llvm::omp::loopConstructSet) - .test(ompDirective)) { - validDirective = true; - genTeamsOp(converter, semaCtx, eval, /*genNested=*/false, currentLocation, - loopOpClauseList, - /*outerCombined=*/true); - } - if (llvm::omp::allDistributeSet.test(ompDirective)) { - validDirective = true; - TODO(currentLocation, "Distribute construct"); - } - if ((llvm::omp::allParallelSet & llvm::omp::loopConstructSet) - .test(ompDirective)) { - validDirective = true; - genParallelOp(converter, symTable, semaCtx, eval, /*genNested=*/false, - currentLocation, loopOpClauseList, - /*outerCombined=*/true); + std::optional nextDir = origDirective; + while (nextDir) { + llvm::omp::Directive leafDir; + std::tie(leafDir, nextDir) = splitCombinedDirective(*nextDir); + if (llvm::omp::compositeConstructSet.test(leafDir)) { + assert(!nextDir && "Composite construct cannot be split"); + switch (leafDir) { + case llvm::omp::Directive::OMPD_distribute_parallel_do: + // 2.9.4.3 DISTRIBUTE PARALLEL Worksharing-Loop construct. + genCompositeDistributeParallelDo(converter, semaCtx, eval, + beginClauseList, endClauseList, + currentLocation); + break; + case llvm::omp::Directive::OMPD_distribute_parallel_do_simd: + // 2.9.4.4 DISTRIBUTE PARALLEL Worksharing-Loop SIMD construct. + genCompositeDistributeParallelDoSimd(converter, semaCtx, eval, + beginClauseList, endClauseList, + currentLocation); + break; + case llvm::omp::Directive::OMPD_distribute_simd: + // 2.9.4.2 DISTRIBUTE SIMD construct. + genCompositeDistributeSimd(converter, semaCtx, eval, beginClauseList, + endClauseList, currentLocation); + break; + case llvm::omp::Directive::OMPD_do_simd: + // 2.9.3.2 Worksharing-Loop SIMD construct. + genCompositeDoSimd(converter, semaCtx, eval, beginClauseList, + endClauseList, currentLocation); + break; + case llvm::omp::Directive::OMPD_taskloop_simd: + // 2.10.3 TASKLOOP SIMD construct. + genCompositeTaskloopSimd(converter, semaCtx, eval, beginClauseList, + endClauseList, currentLocation); + break; + default: + llvm_unreachable("Unexpected composite construct"); + } + } else { + const bool genNested = !nextDir; + switch (leafDir) { + case llvm::omp::Directive::OMPD_distribute: + // 2.9.4.1 DISTRIBUTE construct. + genDistributeOp(converter, semaCtx, eval, genNested, currentLocation, + beginClauseList); + break; + case llvm::omp::Directive::OMPD_do: + // 2.9.2 Worksharing-Loop construct. 
+ genWsloopOp(converter, semaCtx, eval, currentLocation, beginClauseList,
+ endClauseList);
+ break;
+ case llvm::omp::Directive::OMPD_parallel:
+ // 2.6 PARALLEL construct.
+ // FIXME This is not necessarily always the outer leaf construct of a
+ // combined construct in this context (e.g. DISTRIBUTE PARALLEL DO).
+ // Maybe rename the argument if it represents something else or
+ // initialize it properly.
+ genParallelOp(converter, symTable, semaCtx, eval, genNested,
+ currentLocation, beginClauseList,
+ /*outerCombined=*/true);
+ break;
+ case llvm::omp::Directive::OMPD_simd:
+ // 2.9.3.1 SIMD construct.
+ genSimdOp(converter, semaCtx, eval, currentLocation, beginClauseList);
+ break;
+ case llvm::omp::Directive::OMPD_target:
+ // 2.12.5 TARGET construct.
+ genTargetOp(converter, semaCtx, eval, genNested, currentLocation,
+ beginClauseList, /*outerCombined=*/true);
+ break;
+ case llvm::omp::Directive::OMPD_taskloop:
+ // 2.10.2 TASKLOOP construct.
+ genTaskloopOp(converter, semaCtx, eval, currentLocation,
+ beginClauseList);
+ break;
+ case llvm::omp::Directive::OMPD_teams:
+ // 2.7 TEAMS construct.
+ // FIXME This is not necessarily always the outer leaf construct of a
+ // combined construct in this context (e.g. TARGET TEAMS DISTRIBUTE).
+ // Maybe rename the argument if it represents something else or
+ // initialize it properly.
+ genTeamsOp(converter, semaCtx, eval, genNested, currentLocation,
+ beginClauseList, /*outerCombined=*/true);
+ break;
+ case llvm::omp::Directive::OMPD_loop:
+ case llvm::omp::Directive::OMPD_masked:
+ case llvm::omp::Directive::OMPD_master:
+ case llvm::omp::Directive::OMPD_tile:
+ case llvm::omp::Directive::OMPD_unroll:
+ TODO(currentLocation, "Unhandled loop directive (" +
+ llvm::omp::getOpenMPDirectiveName(leafDir) +
+ ")");
+ break;
+ default:
+ llvm_unreachable("Unexpected loop construct");
+ }
+ }
}
- if ((llvm::omp::allDoSet | llvm::omp::allSimdSet).test(ompDirective))
- validDirective = true;
-
- if (!validDirective) {
- TODO(currentLocation, "Unhandled loop directive (" +
- llvm::omp::getOpenMPDirectiveName(ompDirective) +
- ")");
- }
-
- if (llvm::omp::allDoSimdSet.test(ompDirective)) {
- // 2.9.3.2 Workshare SIMD construct
- createSimdWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
- endClauseList, currentLocation);
-
- } else if (llvm::omp::allSimdSet.test(ompDirective)) {
- // 2.9.3.1 SIMD construct
- createSimdLoop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
- currentLocation);
- genOpenMPReduction(converter, semaCtx, loopOpClauseList);
- } else {
- createWsloop(converter, semaCtx, eval, ompDirective, loopOpClauseList,
- endClauseList, currentLocation);
- }
}

static void
@@ -2220,44 +2478,39 @@ genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::semantics::SemanticsContext &semaCtx,
Fortran::lower::pft::Evaluation &eval,
const Fortran::parser::OpenMPSectionsConstruct &sectionsConstruct) {
- mlir::Location currentLocation = converter.getCurrentLocation();
- mlir::omp::SectionsClauseOps clauseOps;
const auto &beginSectionsDirective =
std::get(sectionsConstruct.t);
- const auto &sectionsClauseList =
+ const auto &beginClauseList =
std::get(beginSectionsDirective.t);
// Process clauses before optional omp.parallel, so that new variables are
// allocated outside of the parallel region
- ClauseProcessor cp(converter, semaCtx, sectionsClauseList);
- cp.processSectionsReduction(currentLocation, clauseOps);
- cp.processAllocate(clauseOps);
- // TODO Support delayed privatization.
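// Illustrative sketch (not part of the patch): the loop above peels one
// leaf at a time off a combined construct, lowering outer leaves first and
// generating the nested evaluations only at the innermost leaf. A standalone
// analogue, where splitHead() is a hypothetical stand-in for
// splitCombinedDirective(), strings stand in for llvm::omp::Directive
// values, and composite constructs (which the real code keeps intact) are
// ignored for brevity:

#include <iostream>
#include <optional>
#include <string>
#include <utility>

// Peel the outermost leaf off a space-separated combined-construct name.
static std::pair<std::string, std::optional<std::string>>
splitHead(const std::string &combined) {
  std::size_t pos = combined.find(' ');
  if (pos == std::string::npos)
    return {combined, std::nullopt};
  return {combined.substr(0, pos), combined.substr(pos + 1)};
}

int main() {
  std::optional<std::string> nextDir = "target teams distribute parallel do";
  while (nextDir) {
    auto [leafDir, rest] = splitHead(*nextDir);
    // Mirrors `const bool genNested = !nextDir;` above: only the innermost
    // leaf lowers the construct body.
    bool genNested = !rest.has_value();
    std::cout << "lower " << leafDir
              << (genNested ? " (with nested body)" : "") << '\n';
    nextDir = rest;
  }
}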
+ mlir::Location currentLocation = converter.getCurrentLocation();
+ mlir::omp::SectionsClauseOps clauseOps;
+ genSectionsClauses(converter, semaCtx, beginClauseList, currentLocation,
+ /*clausesFromBeginSections=*/true, clauseOps);
+ // Parallel wrapper of PARALLEL SECTIONS construct
llvm::omp::Directive dir =
std::get(beginSectionsDirective.t)
.v;
-
- // Parallel wrapper of PARALLEL SECTIONS construct
if (dir == llvm::omp::Directive::OMPD_parallel_sections) {
genParallelOp(converter, symTable, semaCtx, eval,
- /*genNested=*/false, currentLocation, sectionsClauseList,
+ /*genNested=*/false, currentLocation, beginClauseList,
/*outerCombined=*/true);
} else {
const auto &endSectionsDirective =
std::get(sectionsConstruct.t);
- const auto &endSectionsClauseList =
+ const auto &endClauseList =
std::get(endSectionsDirective.t);
- ClauseProcessor(converter, semaCtx, endSectionsClauseList)
- .processNowait(clauseOps);
+ genSectionsClauses(converter, semaCtx, endClauseList, currentLocation,
+ /*clausesFromBeginSections=*/false, clauseOps);
}
- // SECTIONS construct
- genOpWithBody(
- OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
- .setGenNested(false),
- clauseOps);
+ // SECTIONS construct.
+ genSectionsOp(converter, semaCtx, eval, currentLocation, clauseOps);
+ // Generate nested SECTION operations recursively.
const auto &sectionBlocks =
std::get(sectionsConstruct.t);
auto &firOpBuilder = converter.getFirOpBuilder();
@@ -2266,40 +2519,12 @@ genOMP(Fortran::lower::AbstractConverter &converter,
llvm::zip(sectionBlocks.v, eval.getNestedEvaluations())) {
symTable.pushScope();
genSectionOp(converter, semaCtx, neval, /*genNested=*/true, currentLocation,
- sectionsClauseList);
+ beginClauseList);
symTable.popScope();
firOpBuilder.restoreInsertionPoint(ip);
}
}

-static void
-genOMP(Fortran::lower::AbstractConverter &converter,
- Fortran::lower::SymMap &symTable,
- Fortran::semantics::SemanticsContext &semaCtx,
- Fortran::lower::pft::Evaluation &eval,
- const Fortran::parser::OpenMPStandaloneConstruct &standaloneConstruct) {
- std::visit(
- Fortran::common::visitors{
- [&](const Fortran::parser::OpenMPSimpleStandaloneConstruct
- &simpleStandaloneConstruct) {
- genOmpSimpleStandalone(converter, semaCtx, eval,
- /*genNested=*/true,
- simpleStandaloneConstruct);
- },
- [&](const Fortran::parser::OpenMPFlushConstruct &flushConstruct) {
- genOmpFlush(converter, semaCtx, eval, flushConstruct);
- },
- [&](const Fortran::parser::OpenMPCancelConstruct &cancelConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct");
- },
- [&](const Fortran::parser::OpenMPCancellationPointConstruct
- &cancellationPointConstruct) {
- TODO(converter.getCurrentLocation(), "OpenMPCancelConstruct");
- },
- },
- standaloneConstruct.u);
-}
-
static void genOMP(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
Fortran::semantics::SemanticsContext &semaCtx,
@@ -2318,10 +2543,9 @@ mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder,
mlir::Operation *op,
mlir::Location loc) {
if (mlir::isa(op))
+ mlir::omp::AtomicUpdateOp, mlir::omp::LoopNestOp>(op))
return builder.create(loc);
- else
- return builder.create(loc);
+ return builder.create(loc);
}

void Fortran::lower::genOpenMPConstruct(
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index 8ab74103cb6a8..5c24c95db427a 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -3993,6 +3993,38 @@ mlir::LogicalResult
fir::CUDAKernelOp::verify() {
return mlir::success();
}

+mlir::LogicalResult fir::CUDAAllocateOp::verify() {
+ if (getPinned() && getStream())
+ return emitOpError("pinned and stream cannot appear at the same time");
+ if (!fir::unwrapRefType(getBox().getType()).isa())
+ return emitOpError(
+ "expect box to be a reference to a class or box type value");
+ if (getSource() &&
+ !fir::unwrapRefType(getSource().getType()).isa())
+ return emitOpError(
+ "expect source to be a reference to/or a class or box type value");
+ if (getErrmsg() &&
+ !fir::unwrapRefType(getErrmsg().getType()).isa())
+ return emitOpError(
+ "expect errmsg to be a reference to/or a box type value");
+ if (getErrmsg() && !getHasStat())
+ return emitOpError("expect stat attribute when errmsg is provided");
+ return mlir::success();
+}
+
+mlir::LogicalResult fir::CUDADeallocateOp::verify() {
+ if (!fir::unwrapRefType(getBox().getType()).isa())
+ return emitOpError(
+ "expect box to be a reference to class or box type value");
+ if (getErrmsg() &&
+ !fir::unwrapRefType(getErrmsg().getType()).isa())
+ return emitOpError(
+ "expect errmsg to be a reference to/or a box type value");
+ if (getErrmsg() && !getHasStat())
+ return emitOpError("expect stat attribute when errmsg is provided");
+ return mlir::success();
+}
+
//===----------------------------------------------------------------------===//
// FIROpsDialect
//===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
index 675314ed9da03..18ca5711bfea8 100644
--- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
+++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp
@@ -728,7 +728,7 @@ conservativeCallConflict(llvm::ArrayRef reaches) {
if (auto callee =
call.getCallableForCallee().dyn_cast()) {
auto module = op->getParentOfType();
- return isInternalPorcedure(
+ return isInternalProcedure(
module.lookupSymbol(callee));
}
return false;
}
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index e85d8d1f7ab53..bafa242a79302 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1048,7 +1048,7 @@ void OmpStructureChecker::CheckThreadprivateOrDeclareTargetVar(
name->symbol->GetUltimate().owner();
if (!curScope.IsTopLevel()) {
const semantics::Scope &declScope =
- GetProgramUnitContaining(curScope);
+ GetProgramUnitOrBlockConstructContaining(curScope);
const semantics::Symbol *sym{
declScope.parent().FindSymbol(name->symbol->name())};
if (sym &&
diff --git a/flang/runtime/CMakeLists.txt b/flang/runtime/CMakeLists.txt
index 2a65a22ab674c..bdd0e07bbfd4d 100644
--- a/flang/runtime/CMakeLists.txt
+++ b/flang/runtime/CMakeLists.txt
@@ -224,7 +224,7 @@ set(supported_files
utf.cpp
)
-enable_cuda_compilation("${supported_files}")
+enable_cuda_compilation(FortranRuntime "${supported_files}")
enable_omp_offload_compilation("${supported_files}")
if (NOT TARGET FortranFloat128Math)
diff --git a/flang/runtime/extensions.cpp b/flang/runtime/extensions.cpp
index 3ac98000335d7..12498b502ae1c 100644
--- a/flang/runtime/extensions.cpp
+++ b/flang/runtime/extensions.cpp
@@ -17,6 +17,7 @@
#include "flang/Runtime/entry-names.h"
#include "flang/Runtime/io-api.h"
#include
+#include
#include
#include
#include
@@ -138,5 +139,77 @@ void RTNAME(Sleep)(std::int64_t seconds) {
std::this_thread::sleep_for(std::chrono::seconds(seconds));
}

+// TODO: not supported on
Windows +#ifndef _WIN32 +std::int64_t FORTRAN_PROCEDURE_NAME(access)(const char *name, + std::int64_t nameLength, const char *mode, std::int64_t modeLength) { + std::int64_t ret{-1}; + if (nameLength <= 0 || modeLength <= 0 || !name || !mode) { + return ret; + } + + // ensure name is null terminated + char *newName{nullptr}; + if (name[nameLength - 1] != '\0') { + newName = static_cast(std::malloc(nameLength + 1)); + std::memcpy(newName, name, nameLength); + newName[nameLength] = '\0'; + name = newName; + } + + // calculate mode + bool read{false}; + bool write{false}; + bool execute{false}; + bool exists{false}; + int imode{0}; + + for (std::int64_t i = 0; i < modeLength; ++i) { + switch (mode[i]) { + case 'r': + read = true; + break; + case 'w': + write = true; + break; + case 'x': + execute = true; + break; + case ' ': + exists = true; + break; + default: + // invalid mode + goto cleanup; + } + } + if (!read && !write && !execute && !exists) { + // invalid mode + goto cleanup; + } + + if (!read && !write && !execute) { + imode = F_OK; + } else { + if (read) { + imode |= R_OK; + } + if (write) { + imode |= W_OK; + } + if (execute) { + imode |= X_OK; + } + } + ret = access(name, imode); + +cleanup: + if (newName) { + free(newName); + } + return ret; +} +#endif + } // namespace Fortran::runtime } // extern "C" diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90 index 643dbe9e949cb..6cfc969e92b20 100644 --- a/flang/test/Driver/msvc-dependent-lib-flags.f90 +++ b/flang/test/Driver/msvc-dependent-lib-flags.f90 @@ -1,7 +1,7 @@ -! RUN: %flang -### --target=aarch64-windows-msvc %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC -! RUN: %flang -### --target=aarch64-windows-msvc -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG -! RUN: %flang -### --target=aarch64-windows-msvc -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL -! RUN: %flang -### --target=aarch64-windows-msvc -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL +! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG ! MSVC: -fc1 ! 
MSVC-SAME: --dependent-lib=clang_rt.builtins.lib diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 92628af37085a..fa7979e8875af 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -180,14 +180,16 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref omp.parallel { %1 = fir.alloca i32 {adapt.valuebyref, pinned} %2 = fir.load %arg0 : !fir.ref - omp.simdloop for (%arg2) : i32 = (%c1_i32) to (%2) step (%c1_i32) { - fir.store %arg2 to %1 : !fir.ref - %3 = fir.load %1 : !fir.ref - %4 = fir.convert %3 : (i32) -> i64 - %5 = arith.subi %4, %c1_i64 : i64 - %6 = fir.coordinate_of %arg1, %5 : (!fir.ref>, i64) -> !fir.ref - fir.store %3 to %6 : !fir.ref - omp.yield + omp.simd { + omp.loop_nest (%arg2) : i32 = (%c1_i32) to (%2) step (%c1_i32) { + fir.store %arg2 to %1 : !fir.ref + %3 = fir.load %1 : !fir.ref + %4 = fir.convert %3 : (i32) -> i64 + %5 = arith.subi %4, %c1_i64 : i64 + %6 = fir.coordinate_of %arg1, %5 : (!fir.ref>, i64) -> !fir.ref + fir.store %3 to %6 : !fir.ref + omp.yield + } } omp.terminator } @@ -202,8 +204,8 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref // CHECK: %[[ONE_3:.*]] = llvm.mlir.constant(1 : i64) : i64 // CHECK: %[[I_VAR:.*]] = llvm.alloca %[[ONE_3]] x i32 {pinned} : (i64) -> !llvm.ptr // CHECK: %[[N:.*]] = llvm.load %[[N_REF]] : !llvm.ptr -> i32 -// CHECK: omp.simdloop -// CHECK-SAME: (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) { +// CHECK: omp.simd { +// CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[ONE_2]]) to (%[[N]]) step (%[[ONE_2]]) { // CHECK: llvm.store %[[I]], %[[I_VAR]] : i32, !llvm.ptr // CHECK: %[[I1:.*]] = llvm.load %[[I_VAR]] : !llvm.ptr -> i32 // CHECK: %[[I1_EXT:.*]] = llvm.sext %[[I1]] : i32 to i64 @@ -212,6 +214,7 @@ func.func @_QPsimd1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref // CHECK: llvm.store %[[I1]], %[[ARR_I_REF]] : i32, !llvm.ptr // CHECK: omp.yield // CHECK: } +// CHECK: } // CHECK: omp.terminator // CHECK: } // CHECK: llvm.return @@ -471,55 +474,59 @@ func.func @_QPomp_target() { // ----- -func.func @_QPsimdloop_with_nested_loop() { +func.func @_QPsimd_with_nested_loop() { %0 = fir.alloca i32 {adapt.valuebyref} - %1 = fir.alloca !fir.array<10xi32> {bindc_name = "a", uniq_name = "_QFsimdloop_with_nested_loopEa"} - %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimdloop_with_nested_loopEi"} - %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFsimdloop_with_nested_loopEj"} + %1 = fir.alloca !fir.array<10xi32> {bindc_name = "a", uniq_name = "_QFsimd_with_nested_loopEa"} + %2 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimd_with_nested_loopEi"} + %3 = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFsimd_with_nested_loopEj"} %c1_i32 = arith.constant 1 : i32 %c10_i32 = arith.constant 10 : i32 %c1_i32_0 = arith.constant 1 : i32 - omp.simdloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_0) { - fir.store %arg0 to %0 : !fir.ref - %c1_i32_1 = arith.constant 1 : i32 - %4 = fir.convert %c1_i32_1 : (i32) -> index - %c10_i32_2 = arith.constant 10 : i32 - %5 = fir.convert %c10_i32_2 : (i32) -> index - %c1 = arith.constant 1 : index - %6 = fir.do_loop %arg1 = %4 to %5 step %c1 -> index { - %8 = fir.convert %arg1 : (index) -> i32 - fir.store %8 to %3 : !fir.ref - %9 = fir.load %0 : !fir.ref - %10 = fir.load %0 : !fir.ref - %11 = fir.convert %10 : (i32) -> i64 - %c1_i64 = arith.constant 1 
: i64
- %12 = arith.subi %11, %c1_i64 : i64
- %13 = fir.coordinate_of %1, %12 : (!fir.ref>, i64) -> !fir.ref
- fir.store %9 to %13 : !fir.ref
- %14 = arith.addi %arg1, %c1 : index
- fir.result %14 : index
+ omp.simd {
+ omp.loop_nest (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32_0) {
+ fir.store %arg0 to %0 : !fir.ref
+ %c1_i32_1 = arith.constant 1 : i32
+ %4 = fir.convert %c1_i32_1 : (i32) -> index
+ %c10_i32_2 = arith.constant 10 : i32
+ %5 = fir.convert %c10_i32_2 : (i32) -> index
+ %c1 = arith.constant 1 : index
+ %6 = fir.do_loop %arg1 = %4 to %5 step %c1 -> index {
+ %8 = fir.convert %arg1 : (index) -> i32
+ fir.store %8 to %3 : !fir.ref
+ %9 = fir.load %0 : !fir.ref
+ %10 = fir.load %0 : !fir.ref
+ %11 = fir.convert %10 : (i32) -> i64
+ %c1_i64 = arith.constant 1 : i64
+ %12 = arith.subi %11, %c1_i64 : i64
+ %13 = fir.coordinate_of %1, %12 : (!fir.ref>, i64) -> !fir.ref
+ fir.store %9 to %13 : !fir.ref
+ %14 = arith.addi %arg1, %c1 : index
+ fir.result %14 : index
+ }
+ %7 = fir.convert %6 : (index) -> i32
+ fir.store %7 to %3 : !fir.ref
+ omp.yield
+ }
- %7 = fir.convert %6 : (index) -> i32
- fir.store %7 to %3 : !fir.ref
- omp.yield
}
return
}

-// CHECK-LABEL: llvm.func @_QPsimdloop_with_nested_loop() {
+// CHECK-LABEL: llvm.func @_QPsimd_with_nested_loop() {
// CHECK: %[[LOWER:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: %[[UPPER:.*]] = llvm.mlir.constant(10 : i32) : i32
// CHECK: %[[STEP:.*]] = llvm.mlir.constant(1 : i32) : i32
-// CHECK: omp.simdloop for (%[[CNT:.*]]) : i32 = (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[STEP]]) {
-// CHECK: llvm.br ^bb1(%[[VAL_1:.*]], %[[VAL_2:.*]] : i64, i64)
-// CHECK: ^bb1(%[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64):
-// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : index) : i64
-// CHECK: %[[VAL_6:.*]] = llvm.icmp "sgt" %[[VAL_4]], %[[VAL_5]] : i64
-// CHECK: llvm.cond_br %[[VAL_6]], ^bb2, ^bb3
-// CHECK: ^bb2:
-// CHECK: llvm.br ^bb1(%[[VAL_7:.*]], %[[VAL_8:.*]] : i64, i64)
-// CHECK: ^bb3:
-// CHECK: omp.yield
+// CHECK: omp.simd {
+// CHECK-NEXT: omp.loop_nest (%[[CNT:.*]]) : i32 = (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[STEP]]) {
+// CHECK: llvm.br ^bb1(%[[VAL_1:.*]], %[[VAL_2:.*]] : i64, i64)
+// CHECK: ^bb1(%[[VAL_3:.*]]: i64, %[[VAL_4:.*]]: i64):
+// CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : index) : i64
+// CHECK: %[[VAL_6:.*]] = llvm.icmp "sgt" %[[VAL_4]], %[[VAL_5]] : i64
+// CHECK: llvm.cond_br %[[VAL_6]], ^bb2, ^bb3
+// CHECK: ^bb2:
+// CHECK: llvm.br ^bb1(%[[VAL_7:.*]], %[[VAL_8:.*]] : i64, i64)
+// CHECK: ^bb3:
+// CHECK: omp.yield
+// CHECK: }
// CHECK: }
// CHECK: llvm.return
// CHECK: }
diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir
new file mode 100644
index 0000000000000..6c533a32ccf9b
--- /dev/null
+++ b/flang/test/Fir/cuf-invalid.fir
@@ -0,0 +1,87 @@
+// RUN: fir-opt -split-input-file -verify-diagnostics %s
+
+func.func @_QPsub1() {
+ %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
+ %1 = fir.alloca i32
+ %pinned = fir.alloca i1
+ %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+ %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref>
+ %s = fir.load %1 : !fir.ref
+ // expected-error@+1{{'fir.cuda_allocate' op pinned and stream cannot appear at the same time}}
+ %13 = fir.cuda_allocate %11 : !fir.ref> stream(%s : i32) pinned(%pinned : !fir.ref) {cuda_attr = #fir.cuda} -> i32
+ return
+}
+
+// -----
+
+func.func @_QPsub1() {
+ %1 =
fir.alloca i32 + // expected-error@+1{{'fir.cuda_allocate' op expect box to be a reference to a class or box type value}} + %2 = fir.cuda_allocate %1 : !fir.ref {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + // expected-error@+1{{'fir.cuda_allocate' op expect stat attribute when errmsg is provided}} + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %1 = fir.alloca i32 + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + // expected-error@+1{{'fir.cuda_allocate' op expect errmsg to be a reference to/or a box type value}} + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%1 : !fir.ref) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %1 = fir.alloca i32 + // expected-error@+1{{'fir.cuda_deallocate' op expect box to be a reference to class or box type value}} + %2 = fir.cuda_deallocate %1 : !fir.ref {cuda_attr = #fir.cuda} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %1 = fir.alloca i32 + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + // expected-error@+1{{'fir.cuda_deallocate' op expect errmsg to be a reference to/or a box type value}} + %13 = fir.cuda_deallocate %11 : !fir.ref> errmsg(%1 : !fir.ref) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + // expected-error@+1{{'fir.cuda_deallocate' op expect stat attribute when errmsg is provided}} + %13 = fir.cuda_deallocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda} -> i32 + return +} diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir new file mode 100644 index 0000000000000..71f0652067fac --- /dev/null +++ b/flang/test/Fir/cuf.mlir @@ -0,0 +1,76 @@ +// RUN: fir-opt --split-input-file %s | fir-opt --split-input-file | 
FileCheck %s + +// Simple round trip test of operations. + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + %14 = fir.cuda_deallocate %11 : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> {cuda_attr = #fir.cuda} -> i32 +// CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref> {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca i32 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %s = fir.load %1 : !fir.ref + %13 = fir.cuda_allocate %11 : !fir.ref> stream(%s : i32) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> stream(%{{.*}} : i32) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %1 = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub1Eb"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %12 = fir.convert %5#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> source(%12 : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> source(%{{.*}} : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %pinned = fir.alloca i1 + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %13 = fir.cuda_allocate %11 : !fir.ref> pinned(%pinned : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> pinned(%{{.*}} : !fir.ref) {cuda_attr = #fir.cuda} -> i32 + +// ----- + +func.func @_QPsub1() { + %0 = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} + %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %c100 = arith.constant 100 : index + %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"} + %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %9 = fir.embox %8#1 : (!fir.ref>) -> !fir.box> + %11 = fir.convert %4#1 : (!fir.ref>>>) -> !fir.ref> + %16 = fir.convert %9 : (!fir.box>) -> !fir.box + %13 = fir.cuda_allocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 + %14 = fir.cuda_deallocate %11 : !fir.ref> errmsg(%16 : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 + return +} + +// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref> 
errmsg(%{{.*}} : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 +// CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref> errmsg(%{{.*}} : !fir.box) {cuda_attr = #fir.cuda, hasStat} -> i32 diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf new file mode 100644 index 0000000000000..55223011e8d9e --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf @@ -0,0 +1,107 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test lowering of CUDA allocatable allocate/deallocate statements. + +subroutine sub1() + real, allocatable, device :: a(:) + allocate(a(10)) +end subroutine + +! CHECK-LABEL: func.func @_QPsub1() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub1Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub1Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: fir.call @_FortranAAllocatableSetBounds +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> {cuda_attr = #fir.cuda} -> i32 + +subroutine sub2() + real, allocatable, managed :: a(:) + integer :: istat + allocate(a(10), stat=istat) +end subroutine + +! CHECK-LABEL: func.func @_QPsub2() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub2Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub2Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"} +! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: fir.call @_FortranAAllocatableSetBounds +! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> {cuda_attr = #fir.cuda, hasStat} -> i32 +! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref + +subroutine sub3() + integer, allocatable, pinned :: a(:,:) + logical :: plog + allocate(a(20,30), pinned = plog) +end subroutine + +! CHECK-LABEL: func.func @_QPsub3() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub3Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub3Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"} +! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>) +! CHECK-2: fir.call @_FortranAAllocatableSetBounds +! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref>) {cuda_attr = #fir.cuda} -> i32 + +subroutine sub4() + real, allocatable, unified :: a(:) + integer :: istream + allocate(a(10), stream=istream) +end subroutine + +! CHECK-LABEL: func.func @_QPsub4() +! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub4Ea"} +! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub4Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) +! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"} +! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: fir.call @_FortranAAllocatableSetBounds +! 
CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda} -> i32
+
+subroutine sub5()
+ real, allocatable, device :: a(:)
+ real, allocatable :: b(:)
+ allocate(a, source=b)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub5()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX_A]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub5Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref>>>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref>>> source(%[[LOAD_B]] : !fir.box>>) {cuda_attr = #fir.cuda} -> i32
+
+subroutine sub6()
+ real, allocatable, device :: a(:)
+ real, allocatable :: b(:)
+ allocate(a, mold=b)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub6()
+! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX_A]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
+! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub6Eb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref>>>
+! CHECK: fir.call @_FortranAAllocatableApplyMold
+! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref>>> {cuda_attr = #fir.cuda} -> i32
+
+subroutine sub7()
+ real, allocatable, device :: a(:)
+ integer :: istat
+ character(50) :: err
+ allocate(a(100), stat=istat, errmsg=err)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub7()
+! CHECK: %[[BOX:.*]] = fir.alloca !fir.box>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda, fortran_attrs = #fir.var_attrs, uniq_name = "_QFsub7Ea"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>)
+! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
+! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
+! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref>) -> !fir.box>
+! CHECK: fir.call @_FortranAAllocatableSetBounds
+! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref>>> errmsg(%[[ERR_BOX]] : !fir.box>) {cuda_attr = #fir.cuda, hasStat} -> i32
+!
CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref diff --git a/flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 b/flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 new file mode 100644 index 0000000000000..07f60b98b0941 --- /dev/null +++ b/flang/test/Lower/HLFIR/internal-procedures-bindc-host.f90 @@ -0,0 +1,39 @@ +! Test fir.host_sym attribute to retain link between internal +! and host procedure in FIR even when BIND(C) is involved. + +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -o - %s | fir-opt -external-name-interop -o - |FileCheck %s --check-prefix=AFTER_RENAME_PASS + +subroutine foo() bind(c, name="some_c_name") + call bar() +contains + subroutine bar() + end subroutine +end subroutine +! CHECK: func.func @some_c_name() +! CHECK: func.func private @_QFfooPbar() attributes {fir.host_symbol = @some_c_name, llvm.linkage = #llvm.linkage} +! AFTER_RENAME_PASS: func.func @some_c_name() +! AFTER_RENAME_PASS: func.func private @_QFfooPbar() attributes {fir.host_symbol = @some_c_name, llvm.linkage = #llvm.linkage} + +subroutine notbindc() + call bar() +contains + subroutine bar() + end subroutine +end subroutine +! CHECK: func.func @_QPnotbindc() +! CHECK: func.func private @_QFnotbindcPbar() attributes {fir.host_symbol = @_QPnotbindc, llvm.linkage = #llvm.linkage} +! AFTER_RENAME_PASS: func.func @notbindc_() attributes {fir.internal_name = "_QPnotbindc"} +! AFTER_RENAME_PASS: func.func private @_QFnotbindcPbar() attributes {fir.host_symbol = @notbindc_, llvm.linkage = #llvm.linkage} + + +! Main program +call bar() +contains + subroutine bar() + end subroutine +end +! CHECK: func.func @_QQmain() +! CHECK: func.func private @_QFPbar() attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} +! AFTER_RENAME_PASS: func.func @_QQmain() +! AFTER_RENAME_PASS: func.func private @_QFPbar() attributes {fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/HLFIR/internal-procedures.f90 b/flang/test/Lower/HLFIR/internal-procedures.f90 index c898903b6fbe1..fff7125897ddf 100644 --- a/flang/test/Lower/HLFIR/internal-procedures.f90 +++ b/flang/test/Lower/HLFIR/internal-procedures.f90 @@ -10,7 +10,7 @@ subroutine internal end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFtest_explicit_shape_arrayPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> @@ -28,7 +28,7 @@ subroutine internal end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFtest_assumed_shapePinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>> @@ -45,7 +45,7 @@ subroutine internal() end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFtest_scalar_charPinternal( -! 
CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.ref> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref> diff --git a/flang/test/Lower/OpenACC/acc-routine04.f90 b/flang/test/Lower/OpenACC/acc-routine04.f90 index 2339c23eaaf85..f603376163901 100644 --- a/flang/test/Lower/OpenACC/acc-routine04.f90 +++ b/flang/test/Lower/OpenACC/acc-routine04.f90 @@ -31,4 +31,4 @@ subroutine sub2() ! CHECK: acc.routine @acc_routine_0 func(@_QMdummy_modPsub1) seq ! CHECK: func.func @_QMdummy_modPsub1(%arg0: !fir.ref {fir.bindc_name = "i"}) attributes {acc.routine_info = #acc.routine_info<[@acc_routine_0]>} ! CHECK: func.func @_QQmain() attributes {fir.bindc_name = "test_acc_routine"} -! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, llvm.linkage = #llvm.linkage} +! CHECK: func.func private @_QFPsub2() attributes {acc.routine_info = #acc.routine_info<[@acc_routine_1]>, fir.host_symbol = @_QQmain, llvm.linkage = #llvm.linkage} diff --git a/flang/test/Lower/OpenMP/FIR/if-clause.f90 b/flang/test/Lower/OpenMP/FIR/if-clause.f90 index a1235be8e61ea..f686b9708fc54 100644 --- a/flang/test/Lower/OpenMP/FIR/if-clause.f90 +++ b/flang/test/Lower/OpenMP/FIR/if-clause.f90 @@ -116,7 +116,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -124,7 +124,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -134,7 +134,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -147,7 +147,7 @@ program main ! ---------------------------------------------------------------------------- ! SIMD ! ---------------------------------------------------------------------------- - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp simd @@ -155,14 +155,14 @@ program main end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(.true.) do i = 1, 10 end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(simd: .true.) do i = 1, 10 @@ -281,7 +281,6 @@ program main end do !$omp end target parallel do - ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -360,7 +359,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd @@ -370,7 +369,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(.true.) do i = 1, 10 @@ -379,7 +378,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(target: .true.) if(simd: .false.) do i = 1, 10 @@ -388,7 +387,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd if(target: .true.) 
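! Illustrative sketch (not part of the patch): the checks in this test
! exercise the OpenMP binding rule for IF on combined constructs. An IF
! clause without a directive-name modifier applies to every leaf construct
! that accepts it, while IF(TARGET: ...) or IF(SIMD: ...) binds only to the
! named leaf, which is why the conditions land on omp.target and omp.simd
! independently. A minimal example, with a hypothetical subroutine name and
! assumed logical dummies:

subroutine if_binding_example(c1, c2, n)
  logical :: c1, c2
  integer :: i, n
  ! c1 guards only the generated omp.target; c2 only the omp.simd.
  !$omp target simd if(target: c1) if(simd: c2)
  do i = 1, n
  end do
  !$omp end target simd
end subroutine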
@@ -399,7 +398,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(simd: .true.) do i = 1, 10 diff --git a/flang/test/Lower/OpenMP/FIR/loop-combined.f90 b/flang/test/Lower/OpenMP/FIR/loop-combined.f90 index a6cec1beb49c8..6c6618dc9fb57 100644 --- a/flang/test/Lower/OpenMP/FIR/loop-combined.f90 +++ b/flang/test/Lower/OpenMP/FIR/loop-combined.f90 @@ -75,7 +75,7 @@ program main ! TARGET SIMD ! ---------------------------------------------------------------------------- ! CHECK: omp.target - ! CHECK: omp.simdloop + ! CHECK: omp.simd !$omp target simd do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 index 8f5d280943cc2..8b75ecbaae8c7 100644 --- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 @@ -361,7 +361,8 @@ subroutine simd_loop_1 ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! FIRDialect: omp.simd { + ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$OMP SIMD PRIVATE(r) do i=1, 9 ! FIRDialect: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/FIR/simd.f90 b/flang/test/Lower/OpenMP/FIR/simd.f90 index c8c2022d693d4..db7d30295c45d 100644 --- a/flang/test/Lower/OpenMP/FIR/simd.f90 +++ b/flang/test/Lower/OpenMP/FIR/simd.f90 @@ -2,32 +2,34 @@ ! RUN: bbc -fopenmp -emit-fir -hlfir=false %s -o - | FileCheck %s -!CHECK-LABEL: func @_QPsimdloop() -subroutine simdloop -integer :: i +!CHECK-LABEL: func @_QPsimd() +subroutine simd + integer :: i !$OMP SIMD ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i=1, 9 ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref, i32) -> i1 print*, i end do - !$OMP END SIMD + !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_if_clause -subroutine simdloop_with_if_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_if_clause +subroutine simd_with_if_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD IF( n .GE. threshold ) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 ! CHECK: %[[COND:.*]] = arith.cmpi sge - ! CHECK: omp.simdloop if(%[[COND:.*]]) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd if(%[[COND:.*]]) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! 
CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -37,14 +39,15 @@ subroutine simdloop_with_if_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause -subroutine simdloop_with_simdlen_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause +subroutine simd_with_simdlen_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -54,15 +57,16 @@ subroutine simdloop_with_simdlen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_param -subroutine simdloop_with_simdlen_clause_from_param(n, threshold) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param +subroutine simd_with_simdlen_clause_from_param(n, threshold) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -72,15 +76,16 @@ subroutine simdloop_with_simdlen_clause_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_expr_from_param -subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param +subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -90,14 +95,15 @@ subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause -subroutine simdloop_with_safelen_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_safelen_clause +subroutine simd_with_safelen_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! 
CHECK: omp.simdloop safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -107,15 +113,16 @@ subroutine simdloop_with_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause_from_expr_from_param -subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) -integer :: i, n, threshold -integer, parameter :: safelen = 2; +!CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param +subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold) + integer :: i, n, threshold + integer, parameter :: safelen = 2; !$OMP SIMD SAFELEN(safelen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -125,14 +132,15 @@ subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_safelen_clause -subroutine simdloop_with_simdlen_safelen_clause(n, threshold) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause +subroutine simd_with_simdlen_safelen_clause(n, threshold) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(1) SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %arg0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(1) safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(1) safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref @@ -142,20 +150,21 @@ subroutine simdloop_with_simdlen_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_collapse_clause -subroutine simdloop_with_collapse_clause(n) -integer :: i, j, n -integer :: A(n,n) -! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 -! CHECK: omp.simdloop for (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( -! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( -! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( -! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { +!CHECK-LABEL: func @_QPsimd_with_collapse_clause +subroutine simd_with_collapse_clause(n) + integer :: i, j, n + integer :: A(n,n) + ! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 + ! 
CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 + ! CHECK: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( + ! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( + ! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( + ! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { !$OMP SIMD COLLAPSE(2) do i = 1, n do j = 1, n diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90 index 022327f9c25da..ca3162340d784 100644 --- a/flang/test/Lower/OpenMP/FIR/target.f90 +++ b/flang/test/Lower/OpenMP/FIR/target.f90 @@ -411,8 +411,8 @@ end subroutine omp_target_implicit_bounds !CHECK-LABEL: func.func @_QPomp_target_thread_limit() { subroutine omp_target_thread_limit integer :: a - !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} + !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %[[ARG_0:.*]] : !fir.ref) { !CHECK: ^bb0(%[[ARG_0]]: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90 index f982bf67b0722..ce4427a0c2cab 100644 --- a/flang/test/Lower/OpenMP/if-clause.f90 +++ b/flang/test/Lower/OpenMP/if-clause.f90 @@ -116,7 +116,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -124,7 +124,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop @@ -134,7 +134,7 @@ program main do i = 1, 10 end do !$omp end parallel do simd - + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -147,7 +147,7 @@ program main ! ---------------------------------------------------------------------------- ! SIMD ! ---------------------------------------------------------------------------- - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp simd @@ -155,14 +155,14 @@ program main end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(.true.) do i = 1, 10 end do !$omp end simd - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp simd if(simd: .true.) do i = 1, 10 @@ -281,7 +281,6 @@ program main end do !$omp end target parallel do - ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -360,7 +359,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd @@ -370,7 +369,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(.true.) do i = 1, 10 @@ -379,7 +378,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-SAME: if({{.*}}) !$omp target simd if(target: .true.) if(simd: .false.) do i = 1, 10 @@ -388,7 +387,7 @@ program main ! CHECK: omp.target ! CHECK-SAME: if({{.*}}) - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { !$omp target simd if(target: .true.) @@ -399,7 +398,7 @@ program main ! CHECK: omp.target ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.simdloop + ! CHECK: omp.simd ! 
CHECK-SAME: if({{.*}}) !$omp target simd if(simd: .true.) do i = 1, 10 diff --git a/flang/test/Lower/OpenMP/loop-combined.f90 b/flang/test/Lower/OpenMP/loop-combined.f90 index 70488b6a769ce..298634b3f6f82 100644 --- a/flang/test/Lower/OpenMP/loop-combined.f90 +++ b/flang/test/Lower/OpenMP/loop-combined.f90 @@ -75,7 +75,7 @@ program main ! TARGET SIMD ! ---------------------------------------------------------------------------- ! CHECK: omp.target - ! CHECK: omp.simdloop + ! CHECK: omp.simd !$omp target simd do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90 index 5578b6710da7c..775f7b4f2cb10 100644 --- a/flang/test/Lower/OpenMP/parallel-private-clause.f90 +++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90 @@ -411,7 +411,8 @@ subroutine simd_loop_1 ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32 ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32 - ! FIRDialect: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! FIRDialect: omp.simd { + ! FIRDialect-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { !$OMP SIMD PRIVATE(r) do i=1, 9 ! FIRDialect: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90 index 135b38c792623..190aa61521217 100644 --- a/flang/test/Lower/OpenMP/simd.f90 +++ b/flang/test/Lower/OpenMP/simd.f90 @@ -3,33 +3,35 @@ !RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp %s -o - | FileCheck %s !RUN: bbc -hlfir -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK-LABEL: func @_QPsimdloop() -subroutine simdloop -integer :: i +!CHECK-LABEL: func @_QPsimd() +subroutine simd + integer :: i !$OMP SIMD ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32 ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK-NEXT: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i=1, 9 ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref, i32) -> i1 print*, i end do - !$OMP END SIMD + !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_if_clause -subroutine simdloop_with_if_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_if_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_if_clause +subroutine simd_with_if_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_if_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD IF( n .GE. threshold ) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 ! CHECK: %[[COND:.*]] = arith.cmpi sge - ! CHECK: omp.simdloop if(%[[COND:.*]]) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd if(%[[COND:.*]]) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! 
CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -39,15 +41,16 @@ subroutine simdloop_with_if_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause -subroutine simdloop_with_simdlen_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause +subroutine simd_with_simdlen_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -57,16 +60,17 @@ subroutine simdloop_with_simdlen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_param -subroutine simdloop_with_simdlen_clause_from_param(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_clause_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_param +subroutine simd_with_simdlen_clause_from_param(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clause_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -76,16 +80,17 @@ subroutine simdloop_with_simdlen_clause_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_clause_from_expr_from_param -subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold -integer, parameter :: simdlen = 2; +!CHECK-LABEL: func @_QPsimd_with_simdlen_clause_from_expr_from_param +subroutine simd_with_simdlen_clause_from_expr_from_param(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold + integer, parameter :: simdlen = 2; !$OMP SIMD SIMDLEN(simdlen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! 
CHECK: omp.simdloop simdlen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -95,15 +100,16 @@ subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause -subroutine simdloop_with_safelen_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_safelen_clause +subroutine simd_with_safelen_clause(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -113,16 +119,17 @@ subroutine simdloop_with_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_safelen_clause_from_expr_from_param -subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold -integer, parameter :: safelen = 2; +!CHECK-LABEL: func @_QPsimd_with_safelen_clause_from_expr_from_param +subroutine simd_with_safelen_clause_from_expr_from_param(n, threshold) + ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_safelen_clause_from_expr_from_paramEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold + integer, parameter :: safelen = 2; !$OMP SIMD SAFELEN(safelen*2 + 2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop safelen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd safelen(6) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -132,15 +139,16 @@ subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_simdlen_safelen_clause -subroutine simdloop_with_simdlen_safelen_clause(n, threshold) - ! CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimdloop_with_simdlen_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) -integer :: i, n, threshold +!CHECK-LABEL: func @_QPsimd_with_simdlen_safelen_clause +subroutine simd_with_simdlen_safelen_clause(n, threshold) + ! 
CHECK: %[[ARG_N:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsimd_with_simdlen_safelen_clauseEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + integer :: i, n, threshold !$OMP SIMD SIMDLEN(1) SAFELEN(2) ! CHECK: %[[LB:.*]] = arith.constant 1 : i32 ! CHECK: %[[UB:.*]] = fir.load %[[ARG_N]]#0 ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32 - ! CHECK: omp.simdloop simdlen(1) safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK: omp.simd simdlen(1) safelen(2) { + ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { do i = 1, n ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref @@ -150,20 +158,21 @@ subroutine simdloop_with_simdlen_safelen_clause(n, threshold) !$OMP END SIMD end subroutine -!CHECK-LABEL: func @_QPsimdloop_with_collapse_clause -subroutine simdloop_with_collapse_clause(n) -integer :: i, j, n -integer :: A(n,n) -! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 -! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 -! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref -! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 -! CHECK: omp.simdloop for (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( -! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( -! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( -! CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { +!CHECK-LABEL: func @_QPsimd_with_collapse_clause +subroutine simd_with_collapse_clause(n) + integer :: i, j, n + integer :: A(n,n) + ! CHECK: %[[LOWER_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_I:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_I:.*]] = arith.constant 1 : i32 + ! CHECK: %[[LOWER_J:.*]] = arith.constant 1 : i32 + ! CHECK: %[[UPPER_J:.*]] = fir.load %[[PARAM_ARG:.*]] : !fir.ref + ! CHECK: %[[STEP_J:.*]] = arith.constant 1 : i32 + ! CHECK: omp.simd { + ! CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]], %[[ARG_1:.*]]) : i32 = ( + ! CHECK-SAME: %[[LOWER_I]], %[[LOWER_J]]) to ( + ! CHECK-SAME: %[[UPPER_I]], %[[UPPER_J]]) inclusive step ( + ! 
CHECK-SAME: %[[STEP_I]], %[[STEP_J]]) { !$OMP SIMD COLLAPSE(2) do i = 1, n do j = 1, n diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 6f72b5a34d069..51b66327dfb24 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -490,8 +490,8 @@ end subroutine omp_target_implicit_bounds !CHECK-LABEL: func.func @_QPomp_target_thread_limit() { subroutine omp_target_thread_limit integer :: a - !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) -> !fir.ref {name = "a"} + !CHECK: %[[VAL_1:.*]] = arith.constant 64 : i32 !CHECK: omp.target thread_limit(%[[VAL_1]] : i32) map_entries(%[[MAP]] -> %{{.*}} : !fir.ref) { !CHECK: ^bb0(%{{.*}}: !fir.ref): !$omp target map(tofrom: a) thread_limit(64) diff --git a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 index d39ae1e701183..7d02987c5eade 100644 --- a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 @@ -24,3 +24,4 @@ subroutine sub() print *, a !$omp end parallel end subroutine + diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 index b47bff5bebb0b..a8d29baf74f22 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association-2.f90 @@ -12,7 +12,7 @@ !CHECK: fir.call @_QFPsub() fastmath : () -> () !CHECK: return !CHECK: } -!CHECK: func.func private @_QFPsub() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +!CHECK: func.func private @_QFPsub() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { !CHECK: %[[A:.*]] = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFEa"} !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) : !fir.ref diff --git a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 index 98f7b51bb9711..096e510c19c69 100644 --- a/flang/test/Lower/OpenMP/threadprivate-host-association.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-host-association.f90 @@ -11,7 +11,7 @@ !CHECK: fir.call @_QFPsub() fastmath : () -> () !CHECK: return !CHECK: } -!CHECK: func.func private @_QFPsub() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +!CHECK: func.func private @_QFPsub() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { !CHECK: %[[A:.*]] = fir.address_of(@_QFEa) : !fir.ref !CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {uniq_name = "_QFEa"} : (!fir.ref) -> (!fir.ref, !fir.ref) !CHECK: %[[TP_A:.*]] = omp.threadprivate %[[A_DECL]]#1 : !fir.ref -> !fir.ref diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 index 33b5971656010..d849dd206b943 100644 --- a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 +++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 @@ -21,7 +21,7 @@ subroutine only_use_device_ptr !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr() !CHECK: omp.target_data use_device_ptr({{.*}} : !fir.ref>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: 
!fir.ref>, %{{.*}}: !fir.ref>>>): subroutine mix_use_device_ptr_and_addr use iso_c_binding integer, pointer, dimension(:) :: array @@ -47,7 +47,7 @@ subroutine only_use_device_addr !CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map() !CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref, !fir.ref) use_device_ptr(%{{.*}} : !fir.ref>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref>>>, !fir.ref>>>) { -!CHECK: ^bb0(%{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>>>): +!CHECK: ^bb0(%{{.*}}: !fir.ref>>>, %{{.*}}: !fir.ref>, %{{.*}}: !fir.ref>>>): subroutine mix_use_device_ptr_and_addr_and_map use iso_c_binding integer :: i, j diff --git a/flang/test/Lower/character-elemental.f90 b/flang/test/Lower/character-elemental.f90 index 6c46454176f53..9a9cf8bf2d9c6 100644 --- a/flang/test/Lower/character-elemental.f90 +++ b/flang/test/Lower/character-elemental.f90 @@ -5,6 +5,12 @@ subroutine substring_main character*7 :: string(2) = ['12 ', '12 '] integer :: result(2) integer :: ival +interface + elemental function inner(arg) + character(len=*), intent(in) :: arg + integer :: inner + end function inner +end interface ival = 1 ! CHECK: %[[a0:.*]] = fir.alloca i32 {bindc_name = "ival", uniq_name = "_QFsubstring_mainEival"} @@ -26,14 +32,7 @@ subroutine substring_main ! CHECK: %[[a14:.*]] = fir.coordinate_of %[[a13]], %[[a12]] : (!fir.ref>>, index) -> !fir.ref> ! CHECK: %[[a15:.*]] = fir.convert %[[a14]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[a16:.*]] = fir.emboxchar %[[a15]], {{.*}} : (!fir.ref>, index) -> !fir.boxchar<1> - ! CHECK: %[[a17:.*]] = fir.call @_QFsubstring_mainPinner(%[[a16]]) {{.*}}: (!fir.boxchar<1>) -> i32 + ! CHECK: %[[a17:.*]] = fir.call @_QPinner(%[[a16]]) {{.*}}: (!fir.boxchar<1>) -> i32 result = inner(string(1:2)(ival:ival)) print *, result -contains - elemental function inner(arg) - character(len=*), intent(in) :: arg - integer :: inner - - inner = len(arg) - end function inner end subroutine substring_main diff --git a/flang/test/Lower/equivalence-with-host-assoc.f90 b/flang/test/Lower/equivalence-with-host-assoc.f90 index 0ffb1bc5bf9ee..b8ce72f3787c0 100644 --- a/flang/test/Lower/equivalence-with-host-assoc.f90 +++ b/flang/test/Lower/equivalence-with-host-assoc.f90 @@ -10,7 +10,7 @@ subroutine inner i1 = j1 end subroutine inner end subroutine test1 -! FIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -24,7 +24,7 @@ end subroutine test1 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QFtest1Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QFtest1Ei1) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -54,7 +54,7 @@ subroutine inner end subroutine inner end subroutine host end module test2 -! FIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! 
FIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -68,7 +68,7 @@ end module test2 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QMtest2FhostPinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@_QMtest2FhostEf1) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -94,7 +94,7 @@ subroutine inner i1 = j1 + k1 end subroutine inner end subroutine test3 -! FIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -115,7 +115,7 @@ end subroutine test3 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QFtest3Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -149,7 +149,7 @@ subroutine inner i1 = j1 + k1 end subroutine inner end subroutine test4 -! FIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! FIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! FIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! FIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! FIR: %[[VAL_2:.*]] = arith.constant 0 : index @@ -170,7 +170,7 @@ end subroutine test4 ! FIR: return ! FIR: } -! HLFIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! HLFIR-LABEL: func.func private @_QFtest4Pinner() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! HLFIR: %[[VAL_0:.*]] = fir.address_of(@blk_) : !fir.ref> ! HLFIR: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! HLFIR: %[[VAL_2:.*]] = arith.constant 0 : index diff --git a/flang/test/Lower/explicit-interface-results-2.f90 b/flang/test/Lower/explicit-interface-results-2.f90 index 86aae720e7fcf..a63ee5fc91794 100644 --- a/flang/test/Lower/explicit-interface-results-2.f90 +++ b/flang/test/Lower/explicit-interface-results-2.f90 @@ -70,7 +70,7 @@ subroutine host4() call internal_proc_a() contains ! CHECK-LABEL: func private @_QFhost4Pinternal_proc_a -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal_proc_a() call takes_array(return_array()) ! 
CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 @@ -94,7 +94,7 @@ subroutine host5() implicit none call internal_proc_a() contains -! CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-LABEL: func private @_QFhost5Pinternal_proc_a() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal_proc_a() call takes_array(return_array()) ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QMsome_moduleEn_module) : !fir.ref diff --git a/flang/test/Lower/host-associated-functions.f90 b/flang/test/Lower/host-associated-functions.f90 index 78d081748c2f4..d67a74fa39980 100644 --- a/flang/test/Lower/host-associated-functions.f90 +++ b/flang/test/Lower/host-associated-functions.f90 @@ -20,7 +20,7 @@ subroutine capture_char_func_dummy(char_func_dummy, n) call internal() contains ! CHECK-LABEL: func private @_QFcapture_char_func_dummyPinternal( - ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>, !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref ()>, i64>, !fir.ref>>, i32) -> !fir.ref ()>, i64>> @@ -56,7 +56,7 @@ subroutine capture_char_func_assumed_dummy(char_func_dummy) call internal() contains ! CHECK-LABEL: func private @_QFcapture_char_func_assumed_dummyPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref ()>, i64>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine internal() ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref ()>, i64>>>, i32) -> !fir.ref ()>, i64>> @@ -110,7 +110,7 @@ function array_func() contains subroutine internal() ! CHECK-LABEL: func private @_QFcapture_array_funcPinternal( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.llvm_ptr> diff --git a/flang/test/Lower/host-associated-globals.f90 b/flang/test/Lower/host-associated-globals.f90 index fe612e777aeaa..c91a5a46af0d5 100644 --- a/flang/test/Lower/host-associated-globals.f90 +++ b/flang/test/Lower/host-associated-globals.f90 @@ -37,7 +37,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-LABEL: func.func private @_QFtest_commonPbar() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@x_) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = fir.convert %[[VAL_0]] : (!fir.ref>) -> !fir.ref> ! CHECK: %[[VAL_2:.*]] = arith.constant 4 : index @@ -59,7 +59,7 @@ subroutine bar() print *, j_in_equiv, not_in_equiv end subroutine end subroutine -! 
CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-LABEL: func.func private @_QFsaved_equivPbar() attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_0:.*]] = fir.address_of(@_QFsaved_equivEi) : !fir.ref> ! CHECK: %[[VAL_1:.*]] = arith.constant 4 : index ! CHECK: %[[VAL_2:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_1]] : (!fir.ref>, index) -> !fir.ref @@ -80,7 +80,7 @@ subroutine bar() end subroutine end subroutine ! CHECK-LABEL: func.func private @_QFmixed_capturePbar( -! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_1:.*]] = fir.address_of(@_QFmixed_captureEsaved_i) : !fir.ref> ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : index ! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.ref diff --git a/flang/test/Lower/host-associated.f90 b/flang/test/Lower/host-associated.f90 index f88903c8af80f..cdc7e6a05288a 100644 --- a/flang/test/Lower/host-associated.f90 +++ b/flang/test/Lower/host-associated.f90 @@ -20,7 +20,7 @@ subroutine test1 print *, i contains ! CHECK-LABEL: func private @_QFtest1Ptest1_internal( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[iaddr:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[i:.*]] = fir.load %[[iaddr]] : !fir.llvm_ptr> ! CHECK: %[[val:.*]] = fir.call @_QPifoo() {{.*}}: () -> i32 @@ -47,7 +47,7 @@ subroutine test2 print *, a, b contains ! CHECK-LABEL: func private @_QFtest2Ptest2_internal( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test2_internal ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr> @@ -62,7 +62,7 @@ subroutine test2_internal end subroutine test2_internal ! CHECK-LABEL: func private @_QFtest2Ptest2_inner( - ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[arg:[^:]*]]: !fir.ref, !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test2_inner ! CHECK: %[[a:.*]] = fir.coordinate_of %[[arg]], %c0 ! CHECK: %[[aa:.*]] = fir.load %[[a]] : !fir.llvm_ptr> @@ -96,7 +96,7 @@ subroutine test6(c) contains ! CHECK-LABEL: func private @_QFtest6Ptest6_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test6_inner ! CHECK: %[[coor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, i32) -> !fir.ref> ! CHECK: %[[load:.*]] = fir.load %[[coor]] : !fir.ref> @@ -138,7 +138,7 @@ subroutine test3(p,q,i) contains ! CHECK-LABEL: func private @_QFtest3Ptest3_inner( - ! 
CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test3_inner ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>, !fir.box>>>, i32) -> !fir.ref>> ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref>> @@ -185,7 +185,7 @@ subroutine test3a(p) contains ! CHECK: func private @_QFtest3aPtest3a_inner( - ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME: %[[tup:.*]]: !fir.ref>, !fir.box>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test3a_inner ! CHECK: %[[pcoor:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>, !fir.box>>>, i32) -> !fir.ref>> ! CHECK: %[[p:.*]] = fir.load %[[pcoor]] : !fir.ref>> @@ -229,7 +229,7 @@ subroutine test4 contains ! CHECK-LABEL: func private @_QFtest4Ptest4_inner( - ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>, !fir.ref>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>, !fir.ref>>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test4_inner ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, !fir.ref>>>>, i32) -> !fir.llvm_ptr>>> ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr>>> @@ -271,7 +271,7 @@ subroutine test5 contains ! CHECK-LABEL: func private @_QFtest5Ptest5_inner( - ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>>, !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { + ! CHECK-SAME:%[[tup:.*]]: !fir.ref>>>, !fir.ref>>>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine test5_inner ! CHECK: %[[ptup:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>>, !fir.ref>>>>>, i32) -> !fir.llvm_ptr>>>> ! CHECK: %[[p:.*]] = fir.load %[[ptup]] : !fir.llvm_ptr>>>> @@ -309,7 +309,7 @@ subroutine test7(j, k) contains ! CHECK-LABEL: func private @_QFtest7Ptest7_inner( -! CHECK-SAME: %[[i:.*]]: !fir.ref{{.*}}, %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) -> i32 attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[i:.*]]: !fir.ref{{.*}}, %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) -> i32 attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { elemental integer function test7_inner(i) implicit none integer, intent(in) :: i @@ -330,7 +330,7 @@ subroutine issue990() call bar() contains ! CHECK-LABEL: func private @_QFissue990Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() integer :: stmt_func, i stmt_func(i) = i + captured @@ -352,7 +352,7 @@ subroutine issue990b() call bar() contains ! CHECK-LABEL: func private @_QFissue990bPbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! 
CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[addr:.*]] = fir.load %[[tupAddr]] : !fir.llvm_ptr> @@ -373,7 +373,7 @@ real function dummy_proc(x) call bar() contains ! CHECK-LABEL: func private @_QFtest8Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref ()>>>, i32) -> !fir.ref ()>> ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref ()>> @@ -393,7 +393,7 @@ subroutine dummy_proc() call bar() contains ! CHECK-LABEL: func private @_QFtest9Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref ()>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref ()>>>, i32) -> !fir.ref ()>> ! CHECK: %[[dummyProc:.*]] = fir.load %[[tupAddr]] : !fir.ref ()>> @@ -416,7 +416,7 @@ subroutine test10(i) call bar() contains ! CHECK-LABEL: func private @_QFtest10Pbar( -! CHECK-SAME: %[[tup:.*]]: !fir.ref>>>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[tup:.*]]: !fir.ref>>>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { subroutine bar() ! CHECK: %[[tupAddr:.*]] = fir.coordinate_of %[[tup]], %c0{{.*}} : (!fir.ref>>>>>, i32) -> !fir.llvm_ptr>>>> ! CHECK: fir.load %[[tupAddr]] : !fir.llvm_ptr>>>> @@ -435,7 +435,7 @@ subroutine bar() ! CHECK-LABEL: func private @_QFtest_proc_dummyPtest_proc_dummy_a( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "j"}, -! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 ! CHECK: %[[VAL_3:.*]] = fir.coordinate_of %[[VAL_1]], %[[VAL_2]] : (!fir.ref>>, i32) -> !fir.llvm_ptr> ! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_3]] : !fir.llvm_ptr> @@ -528,7 +528,7 @@ end subroutine test_proc_dummy_other ! CHECK-LABEL: func private @_QFtest_proc_dummy_charPgen_message( ! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>, ! CHECK-SAME: %[[VAL_1:.*]]: index, -! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref>> {fir.host_assoc}) -> !fir.boxchar<1> attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : i32 ! CHECK-DAG: %[[VAL_4:.*]] = arith.constant 10 : index ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant false diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90 index e031b4805dc5b..70c1f768e389a 100644 --- a/flang/test/Lower/polymorphic.f90 +++ b/flang/test/Lower/polymorphic.f90 @@ -520,7 +520,7 @@ subroutine internal end subroutine ! CHECK-LABEL: func.func private @_QMpolymorphic_testFhost_assocPinternal( -! CHECK-SAME: %[[TUPLE:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.internal_proc, llvm.linkage = #llvm.linkage} { +! 
CHECK-SAME: %[[TUPLE:.*]]: !fir.ref>>> {fir.host_assoc}) attributes {fir.host_symbol = {{.*}}, llvm.linkage = #llvm.linkage} { ! CHECK: %[[POS_IN_TUPLE:.*]] = arith.constant 0 : i32 ! CHECK: %[[COORD_OF_CLASS:.*]] = fir.coordinate_of %[[TUPLE]], %[[POS_IN_TUPLE]] : (!fir.ref>>>, i32) -> !fir.ref>> ! CHECK: %[[CLASS:.*]] = fir.load %[[COORD_OF_CLASS]] : !fir.ref>> diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90 index 74f154bb0ad67..21b99cb82549a 100644 --- a/flang/test/Semantics/OpenMP/clause-validity01.f90 +++ b/flang/test/Semantics/OpenMP/clause-validity01.f90 @@ -342,8 +342,8 @@ a = 1.0 !ERROR: COPYPRIVATE clause is not allowed on the END WORKSHARE directive !$omp end workshare nowait copyprivate(a) + !ERROR: NOWAIT clause is not allowed on the OMP WORKSHARE directive, use it on OMP END WORKSHARE directive !$omp workshare nowait - !ERROR: NOWAIT clause is not allowed on the WORKSHARE directive, use it on OMP END WORKSHARE directive !$omp end workshare !$omp end parallel diff --git a/flang/test/Semantics/OpenMP/threadprivate07.f90 b/flang/test/Semantics/OpenMP/threadprivate07.f90 new file mode 100644 index 0000000000000..c9a006ca0e083 --- /dev/null +++ b/flang/test/Semantics/OpenMP/threadprivate07.f90 @@ -0,0 +1,15 @@ +! RUN: %python %S/../test_errors.py %s %flang_fc1 -fopenmp + +! Check Threadprivate Directive with local variable of a BLOCK construct. + +program main + call sub1() + print *, 'pass' +end program main + +subroutine sub1() + BLOCK + integer, save :: a + !$omp threadprivate(a) + END BLOCK +end subroutine diff --git a/flang/unittests/Runtime/AccessTest.cpp b/flang/unittests/Runtime/AccessTest.cpp new file mode 100644 index 0000000000000..66f19f78c7cfb --- /dev/null +++ b/flang/unittests/Runtime/AccessTest.cpp @@ -0,0 +1,422 @@ +//===-- flang/unittests/Runtime/AccessTest.cpp ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// TODO: ACCESS is not yet implemented on Windows +#ifndef _WIN32 + +#include "CrashHandlerFixture.h" +#include "gtest/gtest.h" +#include "flang/Runtime/extensions.h" +#include "llvm/ADT/Twine.h" + +#include +#include +#include +#include + +namespace { + +struct AccessTests : public CrashHandlerFixture {}; + +struct AccessType { + bool read{false}; + bool write{false}; + bool execute{false}; + bool exists{false}; +}; + +} // namespace + +static std::string addPIDSuffix(const char *name) { + std::stringstream ss; + ss << name; + ss << '.'; + + ss << getpid(); + + return ss.str(); +} + +static bool exists(const std::string &path) { + return access(path.c_str(), F_OK) == 0; +} + +// Implementation of std::filesystem::temp_directory_path adapted from libcxx +// See llvm-project/libcxx/src/filesystem/operations.cpp +// Using std::filesystem is inconvenient because the required flags are not +// consistent across compilers and CMake doesn't have built-in support to +// determine the correct flags.
+static const char *temp_directory_path() { + // TODO: Windows + const char *env_paths[] = {"TMPDIR", "TMP", "TEMP", "TEMPDIR"}; + const char *ret = nullptr; + + for (auto &ep : env_paths) { + if ((ret = getenv(ep))) { + break; + } + } + + if (ret == nullptr) { +#if defined(__ANDROID__) + ret = "/data/local/tmp"; +#else + ret = "/tmp"; +#endif + } + + assert(exists(ret)); + return ret; +} + +static std::string createTemporaryFile( + const char *name, const AccessType &accessType) { + std::string path = + (llvm::Twine{temp_directory_path()} + "/" + addPIDSuffix(name)).str(); + + // O_CREAT | O_EXCL enforces that this file is newly created by this call. + // This feels risky. If we don't have permission to create files in the + // temporary directory or if the files already exist, the test will fail. + // But we can't use std::tmpfile() because we need a path to the file and + // to control the filesystem permissions + mode_t mode{0}; + if (accessType.read) { + mode |= S_IRUSR; + } + if (accessType.write) { + mode |= S_IWUSR; + } + if (accessType.execute) { + mode |= S_IXUSR; + } + + int file = open(path.c_str(), O_CREAT | O_EXCL, mode); + if (file == -1) { + return {}; + } + + close(file); + + return path; +} + +static std::int64_t callAccess( + const std::string &path, const AccessType &accessType) { + const char *cpath{path.c_str()}; + std::int64_t pathlen = std::strlen(cpath); + + std::string mode; + if (accessType.read) { + mode += 'r'; + } + if (accessType.write) { + mode += 'w'; + } + if (accessType.execute) { + mode += 'x'; + } + if (accessType.exists) { + mode += ' '; + } + + const char *cmode = mode.c_str(); + std::int64_t modelen = std::strlen(cmode); + + return FORTRAN_PROCEDURE_NAME(access)(cpath, pathlen, cmode, modelen); +} + +TEST(AccessTests, TestExists) { + AccessType accessType; + accessType.exists = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotExists) { + std::string nonExistant{addPIDSuffix(__func__)}; + ASSERT_FALSE(exists(nonExistant)); + + AccessType accessType; + accessType.exists = true; + std::int64_t res = callAccess(nonExistant, accessType); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestRead) { + AccessType accessType; + accessType.read = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotRead) { + AccessType accessType; + accessType.read = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestWrite) { + AccessType accessType; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotWrite) { + AccessType accessType; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + 
ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestReadWrite) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotReadWrite0) { + AccessType accessType; + accessType.read = false; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotReadWrite1) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotReadWrite2) { + AccessType accessType; + accessType.read = false; + accessType.write = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestExecute) { + AccessType accessType; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotExecute) { + AccessType accessType; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestRWX) { + AccessType accessType; + accessType.read = true; + accessType.write = true; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_EQ(res, 0); +} + +TEST(AccessTests, TestNotRWX0) { + AccessType accessType; + accessType.read = false; + accessType.write = false; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX1) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX2) { + AccessType accessType; + accessType.read = true; + accessType.write = 
true; + accessType.execute = false; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX3) { + AccessType accessType; + accessType.read = true; + accessType.write = false; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +TEST(AccessTests, TestNotRWX4) { + AccessType accessType; + accessType.read = false; + accessType.write = true; + accessType.execute = true; + + std::string path = createTemporaryFile(__func__, accessType); + ASSERT_FALSE(path.empty()); + + accessType.read = true; + accessType.write = true; + accessType.execute = true; + std::int64_t res = callAccess(path, accessType); + + ASSERT_EQ(unlink(path.c_str()), 0); + + ASSERT_NE(res, 0); +} + +#endif // !_WIN32 diff --git a/flang/unittests/Runtime/CMakeLists.txt b/flang/unittests/Runtime/CMakeLists.txt index 23f02aa751246..f7caacad3a598 100644 --- a/flang/unittests/Runtime/CMakeLists.txt +++ b/flang/unittests/Runtime/CMakeLists.txt @@ -1,4 +1,5 @@ add_flang_unittest(FlangRuntimeTests + AccessTest.cpp Allocatable.cpp ArrayConstructor.cpp BufferTest.cpp diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index f33f9430c7920..4e3d1cb9f5337 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -201,6 +201,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index dad187fa0496d..7efd9bcd5b3cb 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -201,6 +201,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt index aea2f6d5771e8..e1303265b9ac4 100644 --- a/libc/config/darwin/arm/entrypoints.txt +++ b/libc/config/darwin/arm/entrypoints.txt @@ -112,6 +112,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/darwin/x86_64/entrypoints.txt b/libc/config/darwin/x86_64/entrypoints.txt index 09fe3d7b47687..02912decadcf7 100644 --- a/libc/config/darwin/x86_64/entrypoints.txt +++ b/libc/config/darwin/x86_64/entrypoints.txt @@ -106,6 +106,7 @@ set(TARGET_LIBM_ENTRYPOINTS # libc.src.fenv.fesetround # libc.src.fenv.feraiseexcept # libc.src.fenv.fetestexcept + # libc.src.fenv.fetestexceptflag # libc.src.fenv.feupdateenv ## Currently disabled for failing tests. 
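The AccessTest.cpp suite above repeats one pattern: create a temporary file whose permission bits mirror an AccessType, call the Fortran ACCESS extension with a mode string assembled from 'r', 'w', 'x', and ' ', unlink the file, then assert on the returned status. As a hedged illustration of the mode-string convention those tests encode (the helper name modeStringToFlags and the exact mapping are assumptions for illustration, not the flang runtime's source), such a string could translate to POSIX access() bits as follows:

```cpp
// Illustrative sketch only (assumed helper, not flang runtime code):
// translate a Fortran ACCESS mode string into POSIX access() flag bits.
#include <cstddef>
#include <unistd.h>

static int modeStringToFlags(const char *mode, std::size_t len) {
  int flags = 0; // F_OK == 0, so an all-blank mode degenerates to an existence test
  for (std::size_t i = 0; i < len; ++i) {
    switch (mode[i]) {
    case 'r':
      flags |= R_OK; // readable by the caller
      break;
    case 'w':
      flags |= W_OK; // writable by the caller
      break;
    case 'x':
      flags |= X_OK; // executable (or searchable) by the caller
      break;
    case ' ':
      flags |= F_OK; // existence only
      break;
    }
  }
  return flags; // a caller would pass this to ::access(path, flags)
}
```

Because access() returns 0 exactly when every requested bit is granted, the tests' ASSERT_EQ(res, 0) and ASSERT_NE(res, 0) pairs map directly onto "permission present" versus "permission absent".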
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 2952baacdd67f..1ac6bd9300008 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -324,6 +324,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index 9964971f191b7..5fb92a9c299cc 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -175,6 +175,7 @@ def PThreadAPI : PublicAPI<"pthread.h"> { "__pthread_start_t", "__pthread_tss_dtor_t", "pthread_attr_t", + "pthread_condattr_t", "pthread_mutex_t", "pthread_mutexattr_t", "pthread_t", @@ -241,10 +242,30 @@ def SysSendfileAPI : PublicAPI<"sys/sendfile.h"> { } def SysTypesAPI : PublicAPI<"sys/types.h"> { - let Types = ["blkcnt_t", "blksize_t", "clockid_t", "dev_t", "gid_t", "ino_t", - "mode_t", "nlink_t", "off_t", "pid_t", "pthread_attr_t", "pthread_key_t", - "pthread_mutex_t", "pthread_mutexattr_t", "pthread_once_t", "pthread_t", - "size_t", "ssize_t", "suseconds_t", "time_t", "uid_t"]; + let Types = [ + "blkcnt_t", + "blksize_t", + "clockid_t", + "dev_t", + "gid_t", + "ino_t", + "mode_t", + "nlink_t", + "off_t", + "pid_t", + "pthread_attr_t", + "pthread_condattr_t", + "pthread_key_t", + "pthread_mutex_t", + "pthread_mutexattr_t", + "pthread_once_t", + "pthread_t", + "size_t", + "ssize_t", + "suseconds_t", + "time_t", + "uid_t" + ]; } def SysUtsNameAPI : PublicAPI<"sys/utsname.h"> { diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 35fd588a9a6c4..335981ff7dc7c 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -192,6 +192,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 47c03a61c45a9..87e82e5eb9a06 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -332,6 +332,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 8fdd4575e27e2..2d8136536b218 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -346,6 +346,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints @@ -638,6 +639,12 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.pthread.pthread_attr_setguardsize libc.src.pthread.pthread_attr_setstack libc.src.pthread.pthread_attr_setstacksize + libc.src.pthread.pthread_condattr_destroy + libc.src.pthread.pthread_condattr_getclock + libc.src.pthread.pthread_condattr_getpshared + libc.src.pthread.pthread_condattr_init + libc.src.pthread.pthread_condattr_setclock + libc.src.pthread.pthread_condattr_setpshared libc.src.pthread.pthread_create libc.src.pthread.pthread_detach libc.src.pthread.pthread_equal diff --git a/libc/config/windows/entrypoints.txt 
b/libc/config/windows/entrypoints.txt index c46c947bf3135..71216530c4041 100644 --- a/libc/config/windows/entrypoints.txt +++ b/libc/config/windows/entrypoints.txt @@ -110,6 +110,7 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.fesetround libc.src.fenv.feraiseexcept libc.src.fenv.fetestexcept + libc.src.fenv.fetestexceptflag libc.src.fenv.feupdateenv # math.h entrypoints diff --git a/libc/docs/c23.rst b/libc/docs/c23.rst index 4138c9d7104f3..44724fe1660cb 100644 --- a/libc/docs/c23.rst +++ b/libc/docs/c23.rst @@ -21,7 +21,7 @@ Additions: * fenv.h * fesetexcept |check| - * fetestexceptflag + * fetestexceptflag |check| * fegetmode * fesetmode * math.h diff --git a/libc/docs/fenv.rst b/libc/docs/fenv.rst index 6574fb7246ddd..1dee5515e1174 100644 --- a/libc/docs/fenv.rst +++ b/libc/docs/fenv.rst @@ -42,7 +42,7 @@ fenv.h Functions - |check| - 7.6.6.3 * - fesetexcept - - + - |check| - 7.6.4.4 * - fesetexceptflag - |check| @@ -57,7 +57,7 @@ fenv.h Functions - |check| - 7.6.4.7 * - fetestexceptflag - - + - |check| - 7.6.4.6 * - feupdateenv - |check| diff --git a/libc/examples/README.md b/libc/examples/README.md index 36b886090c6c1..1bc4a67294f2a 100644 --- a/libc/examples/README.md +++ b/libc/examples/README.md @@ -59,7 +59,7 @@ have installed them, you have to inform CMake that we are linking against the full libc as follows: ```bash -cmake ../ -G -DLIBC_FULLBUILD=ON \ +cmake ../ -G -DLLVM_LIBC_FULL_BUILD=ON \ -DCMAKE_SYSROOT= \ -DCMAKE_C_COMPILER=/bin/clang \ -DCMAKE_TRY_COMPILE_TARGET_TYPE=STATIC_LIBRARY diff --git a/libc/examples/examples.cmake b/libc/examples/examples.cmake index 81e99e3cbede9..6bb6b41c252f5 100644 --- a/libc/examples/examples.cmake +++ b/libc/examples/examples.cmake @@ -4,13 +4,13 @@ function(add_example name) ${ARGN} ) - if(LIBC_FULLBUILD) + if(LLVM_LIBC_FULL_BUILD) target_link_options(${name} PRIVATE -static -rtlib=compiler-rt -fuse-ld=lld) elseif(LIBC_OVERLAY_ARCHIVE_DIR) target_link_directories(${name} PRIVATE ${LIBC_OVERLAY_ARCHIVE_DIR}) target_link_options(${name} PRIVATE -l:libllvmlibc.a) else() - message(FATAL_ERROR "Either LIBC_FULLBUILD should be on or " + message(FATAL_ERROR "Either LLVM_LIBC_FULL_BUILD should be on or " "LIBC_OVERLAY_ARCHIVE_DIR should be set.") endif() endfunction() diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index ecb952b60cc06..f53766777e753 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -28,7 +28,7 @@ add_proxy_header_library( fenv_t.h FULL_BUILD_DEPENDS libc.include.llvm-libc-types.fenv_t - libc.incude.fenv + libc.include.fenv ) add_proxy_header_library( @@ -37,5 +37,5 @@ add_proxy_header_library( fexcept_t.h FULL_BUILD_DEPENDS libc.include.llvm-libc-types.fexcept_t - libc.incude.fenv + libc.include.fenv ) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index b85366c8deafe..f5ba2791af3fb 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -321,6 +321,7 @@ add_gen_header( .llvm-libc-types.__pthread_start_t .llvm-libc-types.__pthread_tss_dtor_t .llvm-libc-types.pthread_attr_t + .llvm-libc-types.pthread_condattr_t .llvm-libc-types.pthread_mutex_t .llvm-libc-types.pthread_mutexattr_t .llvm-libc-types.pthread_t diff --git a/libc/include/llvm-libc-types/CMakeLists.txt b/libc/include/llvm-libc-types/CMakeLists.txt index 93a79e1477b33..f26fc0729dc94 100644 --- a/libc/include/llvm-libc-types/CMakeLists.txt +++ b/libc/include/llvm-libc-types/CMakeLists.txt @@ -49,11 +49,12 @@ add_header(pid_t HDR pid_t.h) 
add_header(posix_spawn_file_actions_t HDR posix_spawn_file_actions_t.h) add_header(posix_spawnattr_t HDR posix_spawnattr_t.h) add_header(pthread_attr_t HDR pthread_attr_t.h DEPENDS .size_t) +add_header(pthread_condattr_t HDR pthread_condattr_t.h DEPENDS .clockid_t) add_header(pthread_key_t HDR pthread_key_t.h) add_header(pthread_mutex_t HDR pthread_mutex_t.h DEPENDS .__futex_word .__mutex_type) -add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type) add_header(pthread_mutexattr_t HDR pthread_mutexattr_t.h) add_header(pthread_once_t HDR pthread_once_t.h DEPENDS .__futex_word) +add_header(pthread_t HDR pthread_t.h DEPENDS .__thread_type) add_header(rlim_t HDR rlim_t.h) add_header(time_t HDR time_t.h) add_header(stack_t HDR stack_t.h) diff --git a/libc/include/llvm-libc-types/pthread_condattr_t.h b/libc/include/llvm-libc-types/pthread_condattr_t.h new file mode 100644 index 0000000000000..b91fc2950aa3f --- /dev/null +++ b/libc/include/llvm-libc-types/pthread_condattr_t.h @@ -0,0 +1,18 @@ +//===-- Definition of pthread_condattr_t type -----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_TYPES_PTHREAD_CONDATTR_T_H +#define LLVM_LIBC_TYPES_PTHREAD_CONDATTR_T_H + +#include "clockid_t.h" + +typedef struct { + clockid_t clock; + int pshared; +} pthread_condattr_t; + +#endif // LLVM_LIBC_TYPES_PTHREAD_CONDATTR_T_H diff --git a/libc/include/pthread.h.def b/libc/include/pthread.h.def index abeb839ee83d1..a94d770657e10 100644 --- a/libc/include/pthread.h.def +++ b/libc/include/pthread.h.def @@ -11,6 +11,9 @@ #include "__llvm-libc-common.h" +// TODO: move to a pthreads-macros.h file: +// https://github.com/llvm/llvm-project/issues/88997 + #define PTHREAD_STACK_MIN (1 << 14) // 16KB #define PTHREAD_MUTEX_INITIALIZER {0} @@ -32,6 +35,9 @@ enum { PTHREAD_MUTEX_ROBUST = 0x1, }; +#define PTHREAD_PROCESS_PRIVATE 0 +#define PTHREAD_PROCESS_SHARED 1 + %%public_api() #endif // LLVM_LIBC_PTHREAD_H diff --git a/libc/spec/posix.td b/libc/spec/posix.td index 7095a3964ee3f..0c88dbd848a3f 100644 --- a/libc/spec/posix.td +++ b/libc/spec/posix.td @@ -26,6 +26,7 @@ def UidT : NamedType<"uid_t">; def GidT : NamedType<"gid_t">; def DevT : NamedType<"dev_t">; def ClockIdT : NamedType<"clockid_t">; +def RestrictedClockIdTPtr : RestrictedPtrType; def BlkSizeT : NamedType<"blksize_t">; def BlkCntT : NamedType<"blkcnt_t">; def NLinkT : NamedType<"nlink_t">; @@ -105,6 +106,10 @@ def POSIX : StandardSpec<"POSIX"> { ConstType ConstPThreadAttrTPtr = ConstType; ConstType ConstRestrictedPThreadAttrTPtr = ConstType; + NamedType PThreadCondAttrTType = NamedType<"pthread_condattr_t">; + PtrType PThreadCondAttrTPtr = PtrType; + ConstType ConstRestrictedPThreadCondAttrTPtr = ConstType>; + NamedType PThreadMutexAttrTType = NamedType<"pthread_mutexattr_t">; PtrType PThreadMutexAttrTPtr = PtrType; RestrictedPtrType RestrictedPThreadMutexAttrTPtr = RestrictedPtrType; @@ -980,7 +985,9 @@ def POSIX : StandardSpec<"POSIX"> { [], // Macros [ AtForkCallbackT, + ClockIdT, PThreadAttrTType, + PThreadCondAttrTType, PThreadKeyT, PThreadMutexAttrTType, PThreadMutexTType, @@ -1047,6 +1054,36 @@ def POSIX : StandardSpec<"POSIX"> { RetValSpec, [ArgSpec, ArgSpec, ArgSpec] >, + FunctionSpec< + "pthread_condattr_destroy", + RetValSpec, + [ArgSpec] + >, + 
FunctionSpec< + "pthread_condattr_getclock", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "pthread_condattr_getpshared", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "pthread_condattr_init", + RetValSpec, + [ArgSpec] + >, + FunctionSpec< + "pthread_condattr_setclock", + RetValSpec, + [ArgSpec, ArgSpec] + >, + FunctionSpec< + "pthread_condattr_setpshared", + RetValSpec, + [ArgSpec, ArgSpec] + >, FunctionSpec< "pthread_create", RetValSpec, @@ -1522,9 +1559,30 @@ def POSIX : StandardSpec<"POSIX"> { HeaderSpec SysTypes = HeaderSpec< "sys/types.h", [], // Macros - [BlkCntT, BlkSizeT, ClockIdT, DevT, GidT, InoT, ModeTType, NLinkT, OffTType, PidT, - PThreadAttrTType, PThreadKeyT, PThreadMutexTType, PThreadMutexAttrTType, PThreadOnceT, PThreadTType, - SizeTType, SSizeTType, SuSecondsT, TimeTType, UidT], + [ + BlkCntT, + BlkSizeT, + ClockIdT, + DevT, + GidT, + InoT, + ModeTType, + NLinkT, + OffTType, + PThreadAttrTType, + PThreadCondAttrTType, + PThreadKeyT, + PThreadMutexAttrTType, + PThreadMutexTType, + PThreadOnceT, + PThreadTType, + PidT, + SSizeTType, + SizeTType, + SuSecondsT, + TimeTType, + UidT + ], // Types [], // Enumerations [] // Functions >; diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 63d0449867114..01aa7c70b3b9d 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -149,6 +149,11 @@ def StdC : StandardSpec<"stdc"> { RetValSpec, [ArgSpec] >, + FunctionSpec< + "fetestexceptflag", + RetValSpec, + [ArgSpec, ArgSpec] + >, FunctionSpec< "feraiseexcept", RetValSpec, diff --git a/libc/src/__support/macros/sanitizer.h b/libc/src/__support/macros/sanitizer.h index bd9b62b7121a1..baf44f7996cab 100644 --- a/libc/src/__support/macros/sanitizer.h +++ b/libc/src/__support/macros/sanitizer.h @@ -47,14 +47,13 @@ // Functions to unpoison memory //----------------------------------------------------------------------------- -#if defined(LIBC_HAVE_MEMORY_SANITIZER) && __has_builtin(__builtin_constant_p) +#if defined(LIBC_HAVE_MEMORY_SANITIZER) // Only perform MSAN unpoison in non-constexpr context. 
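// Editorial sketch (not part of the patch; illustration only): the rewritten
// guard below relies on __builtin_is_constant_evaluated(), which asks exactly
// the right question -- "is this call being constant-evaluated?" -- so one
// macro can serve both constexpr and runtime callers:
//
//   constexpr int square(int x) {
//     int r = x * x;
//     MSAN_UNPOISON(&r, sizeof(r)); // expands to a no-op when constant-evaluated
//     return r;
//   }
//
// The old __builtin_constant_p(*addr) test only approximated this: it asks
// whether *addr happens to fold to a compile-time constant, which can also be
// true for ordinary runtime code, silently skipping an unpoison that was needed.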
#include <sanitizer/msan_interface.h> #define MSAN_UNPOISON(addr, size) \ do { \ - if (!__builtin_constant_p(*addr)) { \ + if (!__builtin_is_constant_evaluated()) \ __msan_unpoison(addr, size); \ - } \ } while (0) #else #define MSAN_UNPOISON(ptr, size) #endif diff --git a/libc/src/fenv/CMakeLists.txt b/libc/src/fenv/CMakeLists.txt index a28a7ca4c2d82..c5431b1b9d55e 100644 --- a/libc/src/fenv/CMakeLists.txt +++ b/libc/src/fenv/CMakeLists.txt @@ -17,7 +17,6 @@ add_entrypoint_object( HDRS fesetround.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -30,7 +29,6 @@ add_entrypoint_object( HDRS feclearexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -43,7 +41,6 @@ add_entrypoint_object( HDRS feraiseexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -56,7 +53,19 @@ add_entrypoint_object( HDRS fetestexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 +) + +add_entrypoint_object( + fetestexceptflag + SRCS + fetestexceptflag.cpp + HDRS + fetestexceptflag.h + DEPENDS + libc.hdr.types.fexcept_t libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -69,7 +78,6 @@ add_entrypoint_object( HDRS fegetenv.h DEPENDS - libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS @@ -83,7 +91,6 @@ add_entrypoint_object( HDRS fesetenv.h DEPENDS - libc.hdr.fenv_macros libc.hdr.types.fenv_t libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS @@ -111,7 +118,6 @@ add_entrypoint_object( HDRS fesetexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -166,7 +172,6 @@ add_entrypoint_object( HDRS feenableexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -179,7 +184,6 @@ add_entrypoint_object( HDRS fedisableexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 @@ -192,7 +196,6 @@ add_entrypoint_object( HDRS fegetexcept.h DEPENDS - libc.hdr.fenv_macros libc.src.__support.FPUtil.fenv_impl COMPILE_OPTIONS -O2 diff --git a/libc/src/fenv/fegetexceptflag.h b/libc/src/fenv/fegetexceptflag.h index ad72161e536f8..fcb9598658d43 100644 --- a/libc/src/fenv/fegetexceptflag.h +++ b/libc/src/fenv/fegetexceptflag.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_FENV_FEGETEXCEPTFLAG_H #define LLVM_LIBC_SRC_FENV_FEGETEXCEPTFLAG_H -#include +#include "hdr/types/fexcept_t.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/fenv/fesetexceptflag.h b/libc/src/fenv/fesetexceptflag.h index 15e62eda1b840..a018358dc9dfc 100644 --- a/libc/src/fenv/fesetexceptflag.h +++ b/libc/src/fenv/fesetexceptflag.h @@ -9,7 +9,7 @@ #ifndef LLVM_LIBC_SRC_FENV_FESETEXCEPTFLAG_H #define LLVM_LIBC_SRC_FENV_FESETEXCEPTFLAG_H -#include +#include "hdr/types/fexcept_t.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/fenv/fetestexceptflag.cpp b/libc/src/fenv/fetestexceptflag.cpp new file mode 100644 index 0000000000000..63453350a199f --- /dev/null +++ b/libc/src/fenv/fetestexceptflag.cpp @@ -0,0 +1,23 @@ +//===-- Implementation of fetestexceptflag function -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fenv/fetestexceptflag.h" +#include "hdr/types/fexcept_t.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, fetestexceptflag, + (const fexcept_t *flagp, int excepts)) { + static_assert(sizeof(int) >= sizeof(fexcept_t), + "fexcept_t value cannot fit in an int value."); + return *flagp | fputil::test_except(excepts); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/fenv/fetestexceptflag.h b/libc/src/fenv/fetestexceptflag.h new file mode 100644 index 0000000000000..1c8b0b843f547 --- /dev/null +++ b/libc/src/fenv/fetestexceptflag.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fetestexceptflag --------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_FENV_FETESTEXCEPTFLAG_H +#define LLVM_LIBC_SRC_FENV_FETESTEXCEPTFLAG_H + +#include "hdr/types/fexcept_t.h" + +namespace LIBC_NAMESPACE { + +int fetestexceptflag(const fexcept_t *, int excepts); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_FENV_FETESTEXCEPTFLAG_H diff --git a/libc/src/fenv/feupdateenv.cpp b/libc/src/fenv/feupdateenv.cpp index 7e81b9476da91..0664863538155 100644 --- a/libc/src/fenv/feupdateenv.cpp +++ b/libc/src/fenv/feupdateenv.cpp @@ -10,7 +10,7 @@ #include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/common.h" -#include +#include "hdr/types/fenv_t.h" namespace LIBC_NAMESPACE { diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt index d5e6c802a8452..3d6cf6dde010b 100644 --- a/libc/src/pthread/CMakeLists.txt +++ b/libc/src/pthread/CMakeLists.txt @@ -100,6 +100,71 @@ add_entrypoint_object( libc.src.pthread.pthread_attr_setstacksize ) +add_entrypoint_object( + pthread_condattr_destroy + SRCS + pthread_condattr_destroy.cpp + HDRS + pthread_condattr_destroy.h + DEPENDS + libc.include.pthread +) + +add_entrypoint_object( + pthread_condattr_getclock + SRCS + pthread_condattr_getclock.cpp + HDRS + pthread_condattr_getclock.h + DEPENDS + libc.include.pthread + libc.include.sys_types +) + +add_entrypoint_object( + pthread_condattr_getpshared + SRCS + pthread_condattr_getpshared.cpp + HDRS + pthread_condattr_getpshared.h + DEPENDS + libc.include.pthread +) + +add_entrypoint_object( + pthread_condattr_init + SRCS + pthread_condattr_init.cpp + HDRS + pthread_condattr_init.h + DEPENDS + libc.include.pthread + libc.include.time +) + +add_entrypoint_object( + pthread_condattr_setclock + SRCS + pthread_condattr_setclock.cpp + HDRS + pthread_condattr_setclock.h + DEPENDS + libc.include.errno + libc.include.pthread + libc.include.sys_types + libc.include.time +) + +add_entrypoint_object( + pthread_condattr_setpshared + SRCS + pthread_condattr_setpshared.cpp + HDRS + pthread_condattr_setpshared.h + DEPENDS + libc.include.pthread +) + add_header_library( pthread_mutexattr HDRS diff --git a/libc/src/pthread/pthread_condattr_destroy.cpp b/libc/src/pthread/pthread_condattr_destroy.cpp new file mode 100644 index 0000000000000..41994c6941ffe --- /dev/null +++ b/libc/src/pthread/pthread_condattr_destroy.cpp @@ -0,0 +1,24 @@ +//===-- Implementation of the 
pthread_condattr_destroy --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "pthread_condattr_destroy.h" + +#include "src/__support/common.h" + +#include <pthread.h> + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, pthread_condattr_destroy, + (pthread_condattr_t * attr [[gnu::unused]])) { + // Initializing a pthread_condattr_t acquires no resources, so this is a + // no-op. + return 0; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/pthread/pthread_condattr_destroy.h b/libc/src/pthread/pthread_condattr_destroy.h new file mode 100644 index 0000000000000..2910fa9f96168 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_destroy.h @@ -0,0 +1,20 @@ +//===-- Implementation header for pthread_condattr_destroy ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_DESTROY_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_DESTROY_H + +#include <pthread.h> + +namespace LIBC_NAMESPACE { + +int pthread_condattr_destroy(pthread_condattr_t *attr); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_DESTROY_H diff --git a/libc/src/pthread/pthread_condattr_getclock.cpp b/libc/src/pthread/pthread_condattr_getclock.cpp new file mode 100644 index 0000000000000..a3a3963f4f429 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_getclock.cpp @@ -0,0 +1,25 @@ +//===-- Implementation of the pthread_condattr_getclock -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "pthread_condattr_getclock.h" + +#include "src/__support/common.h" + +#include <pthread.h> // pthread_condattr_t +#include <sys/types.h> // clockid_t + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, pthread_condattr_getclock, + (const pthread_condattr_t *__restrict attr, + clockid_t *__restrict clock_id)) { + *clock_id = attr->clock; + return 0; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/pthread/pthread_condattr_getclock.h b/libc/src/pthread/pthread_condattr_getclock.h new file mode 100644 index 0000000000000..d5878c4f45b53 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_getclock.h @@ -0,0 +1,22 @@ +//===-- Implementation header for pthread_condattr_getclock -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_GETCLOCK_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_GETCLOCK_H + +#include <pthread.h> // pthread_condattr_t +#include <sys/types.h> // clockid_t + +namespace LIBC_NAMESPACE { + +int pthread_condattr_getclock(const pthread_condattr_t *__restrict attr, + clockid_t *__restrict clock_id); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_GETCLOCK_H diff --git a/libc/src/pthread/pthread_condattr_getpshared.cpp b/libc/src/pthread/pthread_condattr_getpshared.cpp new file mode 100644 index 0000000000000..0c5fdc115c25d --- /dev/null +++ b/libc/src/pthread/pthread_condattr_getpshared.cpp @@ -0,0 +1,24 @@ +//===-- Implementation of the pthread_condattr_getpshared -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "pthread_condattr_getpshared.h" + +#include "src/__support/common.h" + +#include <pthread.h> + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, pthread_condattr_getpshared, + (const pthread_condattr_t *__restrict attr, + int *__restrict pshared)) { + *pshared = attr->pshared; + return 0; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/pthread/pthread_condattr_getpshared.h b/libc/src/pthread/pthread_condattr_getpshared.h new file mode 100644 index 0000000000000..3d7a0c1d357c6 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_getpshared.h @@ -0,0 +1,21 @@ +//===-- Implementation header for pthread_condattr_getpshared ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_PSHARED_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_PSHARED_H + +#include <pthread.h> + +namespace LIBC_NAMESPACE { + +int pthread_condattr_getpshared(const pthread_condattr_t *__restrict attr, + int *__restrict pshared); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_PSHARED_H diff --git a/libc/src/pthread/pthread_condattr_init.cpp b/libc/src/pthread/pthread_condattr_init.cpp new file mode 100644 index 0000000000000..54633b2e3a5ea --- /dev/null +++ b/libc/src/pthread/pthread_condattr_init.cpp @@ -0,0 +1,24 @@ +//===-- Implementation of the pthread_condattr_init -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "pthread_condattr_init.h" + +#include "src/__support/common.h" + +#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_PRIVATE +#include <time.h> // CLOCK_REALTIME + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, pthread_condattr_init, (pthread_condattr_t * attr)) { + attr->clock = CLOCK_REALTIME; + attr->pshared = PTHREAD_PROCESS_PRIVATE; + return 0; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/pthread/pthread_condattr_init.h b/libc/src/pthread/pthread_condattr_init.h new file mode 100644 index 0000000000000..9f3c06bb6f4ae --- /dev/null +++ b/libc/src/pthread/pthread_condattr_init.h @@ -0,0 +1,20 @@ +//===-- Implementation header for pthread_condattr_init ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_INIT_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_INIT_H + +#include <pthread.h> + +namespace LIBC_NAMESPACE { + +int pthread_condattr_init(pthread_condattr_t *attr); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_INIT_H diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp new file mode 100644 index 0000000000000..6eca8b30ef7f8 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_setclock.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of the pthread_condattr_setclock -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "pthread_condattr_setclock.h" + +#include "src/__support/common.h" + +#include <errno.h> // EINVAL +#include <pthread.h> // pthread_condattr_t +#include <sys/types.h> // clockid_t +#include <time.h> // CLOCK_MONOTONIC, CLOCK_REALTIME + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, pthread_condattr_setclock, + (pthread_condattr_t * attr, clockid_t clock)) { + + if (clock != CLOCK_MONOTONIC && clock != CLOCK_REALTIME) + return EINVAL; + + attr->clock = clock; + return 0; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/pthread/pthread_condattr_setclock.h b/libc/src/pthread/pthread_condattr_setclock.h new file mode 100644 index 0000000000000..328766fe78833 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_setclock.h @@ -0,0 +1,21 @@ +//===-- Implementation header for pthread_condattr_setclock -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETCLOCK_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETCLOCK_H + +#include <pthread.h> +#include <sys/types.h> // clockid_t + +namespace LIBC_NAMESPACE { + +int pthread_condattr_setclock(pthread_condattr_t *attr, clockid_t clock); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETCLOCK_H diff --git a/libc/src/pthread/pthread_condattr_setpshared.cpp b/libc/src/pthread/pthread_condattr_setpshared.cpp new file mode 100644 index 0000000000000..7f1560acad843 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_setpshared.cpp @@ -0,0 +1,28 @@ +//===-- Implementation of the pthread_condattr_setpshared -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "pthread_condattr_setpshared.h" + +#include "src/__support/common.h" + +#include <errno.h> // EINVAL +#include <pthread.h> // pthread_condattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, pthread_condattr_setpshared, + (pthread_condattr_t * attr, int pshared)) { + + if (pshared != PTHREAD_PROCESS_SHARED && pshared != PTHREAD_PROCESS_PRIVATE) + return EINVAL; + + attr->pshared = pshared; + return 0; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/pthread/pthread_condattr_setpshared.h b/libc/src/pthread/pthread_condattr_setpshared.h new file mode 100644 index 0000000000000..8083bdec78cc4 --- /dev/null +++ b/libc/src/pthread/pthread_condattr_setpshared.h @@ -0,0 +1,20 @@ +//===-- Implementation header for pthread_condattr_setpshared ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETPSHARED_H +#define LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETPSHARED_H + +#include <pthread.h> + +namespace LIBC_NAMESPACE { + +int pthread_condattr_setpshared(pthread_condattr_t *attr, int pshared); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_PTHREAD_PTHREAD_CONDATTR_SETPSHARED_H diff --git a/libc/test/src/fenv/CMakeLists.txt b/libc/test/src/fenv/CMakeLists.txt index 577735599dc01..f277b65e2d42b 100644 --- a/libc/test/src/fenv/CMakeLists.txt +++ b/libc/test/src/fenv/CMakeLists.txt @@ -48,6 +48,7 @@ add_libc_unittest( DEPENDS libc.src.fenv.fegetexceptflag libc.src.fenv.fesetexceptflag + libc.src.fenv.fetestexceptflag libc.src.__support.FPUtil.fenv_impl ) diff --git a/libc/test/src/fenv/exception_flags_test.cpp b/libc/test/src/fenv/exception_flags_test.cpp index d1d8bfcc53db5..9d2be6426a6d0 100644 --- a/libc/test/src/fenv/exception_flags_test.cpp +++ b/libc/test/src/fenv/exception_flags_test.cpp @@ -1,4 +1,4 @@ -//===-- Unittests for fegetexceptflag and fesetexceptflag -----------------===// +//===-- Unittests for fe{get|set|test}exceptflag --------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information.
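// Editorial note (illustration, not part of the patch): fetestexceptflag is
// the saved-snapshot counterpart of fetestexcept -- it queries flags captured
// by fegetexceptflag rather than the live environment. A minimal usage sketch
// using only standard <fenv.h> names:
//
//   fexcept_t saved;
//   feraiseexcept(FE_OVERFLOW);              // raise an exception
//   fegetexceptflag(&saved, FE_ALL_EXCEPT);  // snapshot all flags
//   feclearexcept(FE_ALL_EXCEPT);            // live flags are now clear
//   fetestexcept(FE_OVERFLOW);               // 0: live environment is clean
//   fetestexceptflag(&saved, FE_OVERFLOW);   // nonzero: set in the snapshot
//
// The unittest hunk that follows exercises exactly this get/clear/test round trip.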
@@ -9,11 +9,12 @@ #include "hdr/types/fexcept_t.h" #include "src/fenv/fegetexceptflag.h" #include "src/fenv/fesetexceptflag.h" +#include "src/fenv/fetestexceptflag.h" #include "src/__support/FPUtil/FEnvImpl.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcFenvTest, GetExceptFlagAndSetExceptFlag) { +TEST(LlvmLibcFenvTest, GetSetTestExceptFlag) { // We will disable all exceptions to prevent invocation of the exception // handler. LIBC_NAMESPACE::fputil::disable_except(FE_ALL_EXCEPT); @@ -39,19 +40,36 @@ TEST(LlvmLibcFenvTest, GetExceptFlagAndSetExceptFlag) { ASSERT_EQ(LIBC_NAMESPACE::fesetexceptflag(&eflags, FE_ALL_EXCEPT), 0); ASSERT_NE(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT) & e, 0); + // Exception flags are exactly the flags corresponding to the previously + // raised exception. + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&eflags, FE_ALL_EXCEPT), + LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT)); + // Cleanup. We clear all excepts as raising excepts like FE_OVERFLOW // can also raise FE_INEXACT. LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); } - // Next, we will raise one exception and save the flags. + // Next, we will raise one exception, save the flag and clear all exceptions. LIBC_NAMESPACE::fputil::raise_except(FE_INVALID); - fexcept_t eflags; - LIBC_NAMESPACE::fegetexceptflag(&eflags, FE_ALL_EXCEPT); - // Clear all exceptions and raise two other exceptions. + fexcept_t invalid_flag; + LIBC_NAMESPACE::fegetexceptflag(&invalid_flag, FE_ALL_EXCEPT); + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&invalid_flag, FE_ALL_EXCEPT), + FE_INVALID); LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); + + // Raise two other exceptions and verify that they are set. LIBC_NAMESPACE::fputil::raise_except(FE_OVERFLOW | FE_INEXACT); + fexcept_t overflow_and_inexact_flag; + LIBC_NAMESPACE::fegetexceptflag(&overflow_and_inexact_flag, FE_ALL_EXCEPT); + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&overflow_and_inexact_flag, + FE_ALL_EXCEPT), + FE_OVERFLOW | FE_INEXACT); + ASSERT_EQ(LIBC_NAMESPACE::fetestexceptflag(&overflow_and_inexact_flag, + FE_OVERFLOW | FE_INEXACT), + FE_OVERFLOW | FE_INEXACT); + // When we set the flags and test, we should only see FE_INVALID. 
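// (Editorial aside, not part of the patch: only FE_INVALID survives because
// fesetexceptflag rewrites all FE_ALL_EXCEPT bits from invalid_flag, whose
// snapshot was taken when FE_INVALID alone had been raised -- so the
// FE_OVERFLOW | FE_INEXACT flags raised above are cleared by the restore.)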
- LIBC_NAMESPACE::fesetexceptflag(&eflags, FE_ALL_EXCEPT); + LIBC_NAMESPACE::fesetexceptflag(&invalid_flag, FE_ALL_EXCEPT); EXPECT_EQ(LIBC_NAMESPACE::fputil::test_except(FE_ALL_EXCEPT), FE_INVALID); } diff --git a/libc/test/src/pthread/CMakeLists.txt b/libc/test/src/pthread/CMakeLists.txt index fb0d22ab9d2a5..51954a5babd2c 100644 --- a/libc/test/src/pthread/CMakeLists.txt +++ b/libc/test/src/pthread/CMakeLists.txt @@ -39,3 +39,21 @@ add_libc_unittest( libc.src.pthread.pthread_mutexattr_setrobust libc.src.pthread.pthread_mutexattr_settype ) + +add_libc_unittest( + pthread_condattr_test + SUITE + libc_pthread_unittests + SRCS + pthread_condattr_test.cpp + DEPENDS + libc.include.errno + libc.include.pthread + libc.include.time + libc.src.pthread.pthread_condattr_destroy + libc.src.pthread.pthread_condattr_getclock + libc.src.pthread.pthread_condattr_getpshared + libc.src.pthread.pthread_condattr_init + libc.src.pthread.pthread_condattr_setclock + libc.src.pthread.pthread_condattr_setpshared + ) diff --git a/libc/test/src/pthread/pthread_condattr_test.cpp b/libc/test/src/pthread/pthread_condattr_test.cpp new file mode 100644 index 0000000000000..accb62de92e45 --- /dev/null +++ b/libc/test/src/pthread/pthread_condattr_test.cpp @@ -0,0 +1,71 @@ +//===-- Unittests for pthread_condattr_t ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "test/UnitTest/Test.h" + +#include <errno.h> +#include <pthread.h> +#include <time.h> + +TEST(LlvmLibcPThreadCondAttrTest, InitAndDestroy) { + pthread_condattr_t cond; + ASSERT_EQ(pthread_condattr_init(&cond), 0); + ASSERT_EQ(pthread_condattr_destroy(&cond), 0); +} + +TEST(LlvmLibcPThreadCondAttrTest, GetDefaultValues) { + pthread_condattr_t cond; + + // Invalid clock id. + clockid_t clock = 7; + // Invalid value. + int pshared = 42; + + ASSERT_EQ(pthread_condattr_init(&cond), 0); + ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0); + ASSERT_EQ(clock, CLOCK_REALTIME); + ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0); + ASSERT_EQ(pshared, PTHREAD_PROCESS_PRIVATE); + ASSERT_EQ(pthread_condattr_destroy(&cond), 0); +} + +TEST(LlvmLibcPThreadCondAttrTest, SetGoodValues) { + pthread_condattr_t cond; + + // Invalid clock id. + clockid_t clock = 7; + // Invalid value. + int pshared = 42; + + ASSERT_EQ(pthread_condattr_init(&cond), 0); + ASSERT_EQ(pthread_condattr_setclock(&cond, CLOCK_MONOTONIC), 0); + ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0); + ASSERT_EQ(clock, CLOCK_MONOTONIC); + ASSERT_EQ(pthread_condattr_setpshared(&cond, PTHREAD_PROCESS_SHARED), 0); + ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0); + ASSERT_EQ(pshared, PTHREAD_PROCESS_SHARED); + ASSERT_EQ(pthread_condattr_destroy(&cond), 0); +} + +TEST(LlvmLibcPThreadCondAttrTest, SetBadValues) { + pthread_condattr_t cond; + + // Invalid clock id. + clockid_t clock = 7; + // Invalid value.
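// (Editorial aside, not part of the patch: pthread_* APIs report failure via
// their return value rather than errno, so the asserts that follow compare
// against a literal EINVAL and then verify the attribute still holds its
// default CLOCK_REALTIME / PTHREAD_PROCESS_PRIVATE values.)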
+ int pshared = 42; + + ASSERT_EQ(pthread_condattr_init(&cond), 0); + ASSERT_EQ(pthread_condattr_setclock(&cond, clock), EINVAL); + ASSERT_EQ(pthread_condattr_getclock(&cond, &clock), 0); + ASSERT_EQ(clock, CLOCK_REALTIME); + ASSERT_EQ(pthread_condattr_setpshared(&cond, pshared), EINVAL); + ASSERT_EQ(pthread_condattr_getpshared(&cond, &pshared), 0); + ASSERT_EQ(pshared, PTHREAD_PROCESS_PRIVATE); + ASSERT_EQ(pthread_condattr_destroy(&cond), 0); +} diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 7528228b3b7f9..f605c3bbbe9dc 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -50,7 +50,7 @@ if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DI endif() # Import required tools as targets - foreach( tool clang llvm-as llvm-link opt ) + foreach( tool IN ITEMS clang llvm-as llvm-link opt ) find_program( LLVM_TOOL_${tool} ${tool} PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH ) add_executable( libclc::${tool} IMPORTED GLOBAL ) set_target_properties( libclc::${tool} PROPERTIES IMPORTED_LOCATION ${LLVM_TOOL_${tool}} ) @@ -68,7 +68,7 @@ else() message(FATAL_ERROR "Clang is not enabled, but is required to build libclc in-tree") endif() - foreach( tool clang llvm-as llvm-link opt ) + foreach( tool IN ITEMS clang llvm-as llvm-link opt ) add_executable(libclc::${tool} ALIAS ${tool}) endforeach() endif() @@ -181,7 +181,7 @@ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/libclc.pc DESTINATION "${CMAKE_INSTAL install( DIRECTORY generic/include/clc DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) if( ENABLE_RUNTIME_SUBNORMAL ) - foreach( file subnormal_use_default subnormal_disable ) + foreach( file IN ITEMS subnormal_use_default subnormal_disable ) link_bc( TARGET ${file} INPUTS ${PROJECT_SOURCE_DIR}/generic/lib/${file}.ll @@ -326,7 +326,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) # Generated files are given just as file names, which we must make # absolute to the binary directory. set( input_file ${CMAKE_CURRENT_BINARY_DIR}/${file} ) - set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.o" ) + set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${file}.bc" ) else() # Other files are originally relative to each SOURCE file, which are # then make relative to the libclc root directory. 
We must normalize @@ -336,7 +336,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) get_filename_component( abs_path ${file} ABSOLUTE BASE_DIR ${PROJECT_SOURCE_DIR} ) file( RELATIVE_PATH root_rel_path ${PROJECT_SOURCE_DIR} ${abs_path} ) set( input_file ${PROJECT_SOURCE_DIR}/${file} ) - set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${root_rel_path}.o" ) + set( output_file "${LIBCLC_ARCH_OBJFILE_DIR}/${root_rel_path}.bc" ) endif() get_filename_component( file_dir ${file} DIRECTORY ) @@ -364,7 +364,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( spv_suffix ${arch_suffix}.spv ) add_custom_command( OUTPUT ${spv_suffix} COMMAND ${LLVM_SPIRV} ${spvflags} -o ${spv_suffix} ${builtins_link_lib} - DEPENDS ${builtins_link_lib_tgt} + DEPENDS ${builtins_link_lib} ) add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix} @@ -376,7 +376,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc COMMAND libclc::opt ${opt_flags} -o ${builtins_opt_lib_tgt}.bc ${builtins_link_lib} - DEPENDS libclc::opt ${builtins_link_lib_tgt} + DEPENDS libclc::opt ${builtins_link_lib} ) add_custom_target( ${builtins_opt_lib_tgt} ALL DEPENDS ${builtins_opt_lib_tgt}.bc @@ -385,12 +385,13 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) PROPERTIES TARGET_FILE ${builtins_opt_lib_tgt}.bc ) + set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> ) + # Add prepare target set( obj_suffix ${arch_suffix}.bc ) add_custom_command( OUTPUT ${obj_suffix} - COMMAND prepare_builtins -o ${obj_suffix} - $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> - DEPENDS ${builtins_opt_lib_tgt} prepare_builtins ) + COMMAND prepare_builtins -o ${obj_suffix} ${builtins_opt_lib} + DEPENDS ${builtins_opt_lib} prepare_builtins ) add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} ) # nvptx-- targets don't include workitem builtins diff --git a/libcxx/.clang-format b/libcxx/.clang-format index 39ae1322ffa8a..c37ab817bca90 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -24,6 +24,7 @@ AttributeMacros: [ '_LIBCPP_CONSTEXPR_SINCE_CXX23', '_LIBCPP_CONSTEXPR', '_LIBCPP_CONSTINIT', + '_LIBCPP_DEPRECATED_ATOMIC_SYNC', '_LIBCPP_DEPRECATED_IN_CXX11', '_LIBCPP_DEPRECATED_IN_CXX14', '_LIBCPP_DEPRECATED_IN_CXX17', diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 043d5a8295c1a..2977c26646cb2 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -300,9 +300,9 @@ option(LIBCXX_HAS_EXTERNAL_THREAD_API This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON."
OFF) if (LIBCXX_ENABLE_THREADS) - set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use") + set(LIBCXX_PSTL_BACKEND "std_thread" CACHE STRING "Which PSTL backend to use") else() - set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use") + set(LIBCXX_PSTL_BACKEND "serial" CACHE STRING "Which PSTL backend to use") endif() # Misc options ---------------------------------------------------------------- @@ -792,14 +792,14 @@ elseif (LIBCXX_HARDENING_MODE STREQUAL "debug") config_define(8 _LIBCPP_HARDENING_MODE_DEFAULT) endif() -if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "serial") - config_define(1 _LIBCPP_PSTL_CPU_BACKEND_SERIAL) -elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "std_thread") - config_define(1 _LIBCPP_PSTL_CPU_BACKEND_THREAD) -elseif(LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch") - config_define(1 _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) +if (LIBCXX_PSTL_BACKEND STREQUAL "serial") + config_define(1 _LIBCPP_PSTL_BACKEND_SERIAL) +elseif(LIBCXX_PSTL_BACKEND STREQUAL "std_thread") + config_define(1 _LIBCPP_PSTL_BACKEND_STD_THREAD) +elseif(LIBCXX_PSTL_BACKEND STREQUAL "libdispatch") + config_define(1 _LIBCPP_PSTL_BACKEND_LIBDISPATCH) else() - message(FATAL_ERROR "LIBCXX_PSTL_CPU_BACKEND is set to ${LIBCXX_PSTL_CPU_BACKEND}, which is not a valid backend. + message(FATAL_ERROR "LIBCXX_PSTL_BACKEND is set to ${LIBCXX_PSTL_BACKEND}, which is not a valid backend. Valid backends are: serial, std_thread and libdispatch") endif() diff --git a/libcxx/cmake/caches/Apple.cmake b/libcxx/cmake/caches/Apple.cmake index cec13c08acf10..8768653e620ad 100644 --- a/libcxx/cmake/caches/Apple.cmake +++ b/libcxx/cmake/caches/Apple.cmake @@ -7,7 +7,7 @@ set(LIBCXX_ENABLE_STATIC ON CACHE BOOL "") set(LIBCXX_ENABLE_SHARED ON CACHE BOOL "") set(LIBCXX_CXX_ABI libcxxabi CACHE STRING "") set(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS ON CACHE BOOL "") -set(LIBCXX_PSTL_CPU_BACKEND libdispatch CACHE STRING "") +set(LIBCXX_PSTL_BACKEND libdispatch CACHE STRING "") set(LIBCXX_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "") set(LIBCXXABI_HERMETIC_STATIC_LIBRARY ON CACHE BOOL "") diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst index 28145ed1049e0..a0a0cdb433974 100644 --- a/libcxx/docs/BuildingLibcxx.rst +++ b/libcxx/docs/BuildingLibcxx.rst @@ -206,6 +206,12 @@ libc++ specific options Toggle the installation of the libc++ headers. +.. option:: LIBCXX_INSTALL_MODULES:BOOL + + **Default**: ``OFF`` + + Toggle the installation of the experimental libc++ module sources. + .. option:: LIBCXX_ENABLE_SHARED:BOOL **Default**: ``ON`` diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst index e5db17daa4823..53cc7a77d1af4 100644 --- a/libcxx/docs/ReleaseNotes/19.rst +++ b/libcxx/docs/ReleaseNotes/19.rst @@ -75,6 +75,10 @@ Improvements and New Features Deprecations and Removals ------------------------- +- The C++20 synchronization library (``<barrier>``, ``<latch>``, ``atomic::wait``, etc.) has been deprecated + in language modes prior to C++20. If you are using these features prior to C++20, please update to ``-std=c++20``. + In LLVM 20, the C++20 synchronization library will be removed entirely in language modes prior to C++20. + - TODO: The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode has been deprecated and setting it triggers an error; use the ``LIBCXX_HARDENING_MODE`` CMake variable with the value ``extensive`` instead.
Similarly, the ``_LIBCPP_ENABLE_ASSERTIONS`` macro has been deprecated (setting it to ``1`` still enables the extensive mode in @@ -93,7 +97,7 @@ Deprecations and Removals - The ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_VOID_SPECIALIZATION`` macros have been removed in LLVM 19. -- TODO: The ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have +- The ``_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES`` and ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have been removed in LLVM 19. C++17 and C++20 removed features can still be re-enabled individually. - The ``_LIBCPP_INLINE_VISIBILITY`` and ``_VSTD`` macros have been removed in LLVM 19. diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv index e9d407e79e253..8ace18815f537 100644 --- a/libcxx/docs/Status/FormatPaper.csv +++ b/libcxx/docs/Status/FormatPaper.csv @@ -24,7 +24,7 @@ Section,Description,Dependencies,Assignee,Status,First released version `[time.syn] `_,"Formatter ``chrono::year_month_weekday``",,Mark de Wever,|Complete|,16.0 `[time.syn] `_,"Formatter ``chrono::year_month_weekday_last``",,Mark de Wever,|Complete|,16.0 `[time.syn] `_,"Formatter ``chrono::hh_mm_ss>``",,Mark de Wever,|Complete|,17.0 -`[time.syn] `_,"Formatter ``chrono::sys_info``",A ```` implementation,Mark de Wever,, +`[time.syn] `_,"Formatter ``chrono::sys_info``",,Mark de Wever,|Complete|,19.0 `[time.syn] `_,"Formatter ``chrono::local_info``",A ```` implementation,Mark de Wever,, `[time.syn] `_,"Formatter ``chrono::zoned_time``",A ```` implementation,Mark de Wever,, diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst index c0e85ad4d5e24..8f945656de1ca 100644 --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UsingLibcxx.rst @@ -208,12 +208,6 @@ safety annotations. C++17 Specific Configuration Macros ----------------------------------- -**_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES**: - This macro is used to re-enable all the features removed in C++17. The effect - is equivalent to manually defining each macro listed below. - This macro is deprecated and will be removed in LLVM-19. Use the - individual macros listed below. - **_LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR**: This macro is used to re-enable `auto_ptr`. @@ -238,12 +232,6 @@ C++20 Specific Configuration Macros This macro is used to re-enable the function ``std::shared_ptr<...>::unique()``. -**_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES**: - This macro is used to re-enable all the features removed in C++20. The effect - is equivalent to manually defining each macro listed below. - This macro is deprecated and will be removed in LLVM-19. Use the - individual macros listed below. 
- **_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS**: This macro is used to re-enable the `argument_type`, `result_type`, `first_argument_type`, and `second_argument_type` members of class diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a2af1d9915be4..ee4979bfc6f89 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -73,18 +73,12 @@ set(files __algorithm/pop_heap.h __algorithm/prev_permutation.h __algorithm/pstl_any_all_none_of.h - __algorithm/pstl_backend.h - __algorithm/pstl_backends/cpu_backend.h __algorithm/pstl_backends/cpu_backends/any_of.h - __algorithm/pstl_backends/cpu_backends/backend.h __algorithm/pstl_backends/cpu_backends/fill.h __algorithm/pstl_backends/cpu_backends/find_if.h __algorithm/pstl_backends/cpu_backends/for_each.h - __algorithm/pstl_backends/cpu_backends/libdispatch.h __algorithm/pstl_backends/cpu_backends/merge.h - __algorithm/pstl_backends/cpu_backends/serial.h __algorithm/pstl_backends/cpu_backends/stable_sort.h - __algorithm/pstl_backends/cpu_backends/thread.h __algorithm/pstl_backends/cpu_backends/transform.h __algorithm/pstl_backends/cpu_backends/transform_reduce.h __algorithm/pstl_copy.h @@ -594,6 +588,11 @@ set(files __numeric/transform_exclusive_scan.h __numeric/transform_inclusive_scan.h __numeric/transform_reduce.h + __pstl/backends/libdispatch.h + __pstl/backends/serial.h + __pstl/backends/std_thread.h + __pstl/configuration.h + __pstl/configuration_fwd.h __pstl/cpu_algos/cpu_traits.h __random/bernoulli_distribution.h __random/binomial_distribution.h diff --git a/libcxx/include/__algorithm/pstl_any_all_none_of.h b/libcxx/include/__algorithm/pstl_any_all_none_of.h index 4b1e0e61b5421..911a7e42b3fa3 100644 --- a/libcxx/include/__algorithm/pstl_any_all_none_of.h +++ b/libcxx/include/__algorithm/pstl_any_all_none_of.h @@ -60,7 +60,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool any_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "any_of requires a ForwardIterator"); auto __res = std::__any_of(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -99,7 +99,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool all_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "all_of requires a ForwardIterator"); auto __res = std::__all_of(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -136,7 +136,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool none_of(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "none_of requires a ForwardIterator"); auto __res = std::__none_of(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backend.h deleted file mode 100644 index 53eae58f96095..0000000000000 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backend.h +++ 
/dev/null @@ -1,23 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H - -#include <__algorithm/pstl_backends/cpu_backends/any_of.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> -#include <__algorithm/pstl_backends/cpu_backends/fill.h> -#include <__algorithm/pstl_backends/cpu_backends/find_if.h> -#include <__algorithm/pstl_backends/cpu_backends/for_each.h> -#include <__algorithm/pstl_backends/cpu_backends/merge.h> -#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> -#include <__algorithm/pstl_backends/cpu_backends/transform.h> -#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> -#include <__config> - -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h index be5e54f3fa5c8..3db4765da64b2 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/any_of.h @@ -11,12 +11,12 @@ #include <__algorithm/any_of.h> #include <__algorithm/find_if.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__atomic/atomic.h> #include <__atomic/memory_order.h> #include <__config> #include <__functional/operations.h> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> @@ -34,7 +34,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI optional __parallel_or(_Index __first, _Index __last, _Brick __f) { std::atomic __found(false); - auto __ret = __pstl::__cpu_traits<_Backend>::__parallel_for(__first, __last, [__f, &__found](_Index __i, _Index __j) { + auto __ret = __pstl::__cpu_traits<_Backend>::__for_each(__first, __last, [__f, &__found](_Index __i, _Index __j) { if (!__found.load(std::memory_order_relaxed) && __f(__i, __j)) { __found.store(true, std::memory_order_relaxed); __pstl::__cpu_traits<_Backend>::__cancel_execution(); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h deleted file mode 100644 index cb9425862a2b0..0000000000000 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/backend.h +++ /dev/null @@ -1,45 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H - -#include <__config> -#include - -#if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) -# include <__algorithm/pstl_backends/cpu_backends/serial.h> -#elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) -# include <__algorithm/pstl_backends/cpu_backends/thread.h> -#elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) -# include <__algorithm/pstl_backends/cpu_backends/libdispatch.h> -#else -# error "Invalid CPU backend choice" -#endif - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 - -_LIBCPP_BEGIN_NAMESPACE_STD - -# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) -using __cpu_backend_tag = __pstl::__serial_backend_tag; -# elif defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) -using __cpu_backend_tag = __pstl::__std_thread_backend_tag; -# elif defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) -using __cpu_backend_tag = __pstl::__libdispatch_backend_tag; -# endif - -_LIBCPP_END_NAMESPACE_STD - -#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17 - -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKEND_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h index 49a32f6c5ce55..b5a49f8417d32 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/fill.h @@ -10,9 +10,9 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_FILL_H #include <__algorithm/fill.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> @@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_fill(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __value); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h index 11a5668bf25af..2b1754ea3a755 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/find_if.h @@ -10,12 +10,12 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_FIND_IF_H #include <__algorithm/find_if.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__atomic/atomic.h> #include <__config> #include <__functional/operations.h> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include 
<__type_traits/is_execution_policy.h> #include <__utility/move.h> @@ -42,7 +42,7 @@ __parallel_find(_Index __first, _Index __last, _Brick __f, _Compare __comp, bool _DifferenceType __initial_dist = __b_first ? __n : -1; std::atomic<_DifferenceType> __extremum(__initial_dist); // TODO: find out what is better here: parallel_for or parallel_reduce - auto __res = __pstl::__cpu_traits<_Backend>::__parallel_for( + auto __res = __pstl::__cpu_traits<_Backend>::__for_each( __first, __last, [__comp, __f, __first, &__extremum](_Index __i, _Index __j) { // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of // why using a shared variable scales fairly well in this situation. diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h index 1667ec0f0c4f4..6db212ead8ae6 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/for_each.h @@ -10,9 +10,9 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKNEDS_FOR_EACH_H #include <__algorithm/for_each.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> @@ -40,7 +40,7 @@ _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_for_each(__cpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + return __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { [[maybe_unused]] auto __res = std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __func); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h index d034447904872..f3e59e8c02854 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/merge.h @@ -10,9 +10,9 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_MERGE_H #include <__algorithm/merge.h> -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> @@ -46,7 +46,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_merge( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_merge( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__merge( __first1, __last1, __first2, diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h index ebfa0fc69147d..9ad8cc8fb0f2d 100644 --- 
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/stable_sort.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_STABLE_SORT_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_STABLE_SORT_H -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__algorithm/stable_sort.h> #include <__config> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/is_execution_policy.h> #include <__utility/empty.h> @@ -29,7 +29,7 @@ template _LIBCPP_HIDE_FROM_ABI optional<__empty> __pstl_stable_sort(__cpu_backend_tag, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy>) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_stable_sort( + return __pstl::__cpu_traits<__cpu_backend_tag>::__stable_sort( __first, __last, __comp, [](_RandomAccessIterator __g_first, _RandomAccessIterator __g_last, _Comp __g_comp) { std::stable_sort(__g_first, __g_last, __g_comp); }); diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h index d4c383997a67a..65e166d847e12 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_H -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__algorithm/transform.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> @@ -50,7 +50,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { auto __res = std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>( __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op); @@ -98,7 +98,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_ForwardOutIterator> __pstl_transform( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { - auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_for( + auto __res = __pstl::__cpu_traits<__cpu_backend_tag>::__for_each( __first1, __last1, [__op, __first1, __first2, __result](_ForwardIterator1 __brick_first, _ForwardIterator1 __brick_last) { diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h index 956c7d6a88ce2..af481d505bb91 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h +++ 
b/libcxx/include/__algorithm/pstl_backends/cpu_backends/transform_reduce.h @@ -9,11 +9,11 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_TRANSFORM_REDUCE_H -#include <__algorithm/pstl_backends/cpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/transform_reduce.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__type_traits/desugars_to.h> #include <__type_traits/is_arithmetic.h> @@ -120,7 +120,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce( __first1, std::move(__last1), [__first1, __first2, __transform](_ForwardIterator1 __iter) { @@ -167,7 +167,7 @@ _LIBCPP_HIDE_FROM_ABI optional<_Tp> __pstl_transform_reduce( _UnaryOperation __transform) { if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { - return __pstl::__cpu_traits<__cpu_backend_tag>::__parallel_transform_reduce( + return __pstl::__cpu_traits<__cpu_backend_tag>::__transform_reduce( std::move(__first), std::move(__last), [__transform](_ForwardIterator __iter) { return __transform(*__iter); }, diff --git a/libcxx/include/__algorithm/pstl_copy.h b/libcxx/include/__algorithm/pstl_copy.h index 1069dcec0e117..0fcea33c3919f 100644 --- a/libcxx/include/__algorithm/pstl_copy.h +++ b/libcxx/include/__algorithm/pstl_copy.h @@ -10,12 +10,13 @@ #define _LIBCPP___ALGORITHM_PSTL_COPY_H #include <__algorithm/copy_n.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> #include <__functional/identity.h> #include <__iterator/concepts.h> +#include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_execution_policy.h> @@ -67,6 +68,12 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator copy(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "copy(first, last, result) requires [first, last) to be ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardOutIterator, "copy(first, last, result) requires result to be a ForwardIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "copy(first, last, result) requires result to be an OutputIterator"); auto __res = std::__copy(__policy, std::move(__first), std::move(__last), std::move(__result)); if (!__res) std::__throw_bad_alloc(); @@ -106,6 +113,12 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator copy_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __n, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "copy_n(first, n, result) requires first to be a ForwardIterator"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardOutIterator, 
"copy_n(first, n, result) requires result to be a ForwardIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "copy_n(first, n, result) requires result to be an OutputIterator"); auto __res = std::__copy_n(__policy, std::move(__first), std::move(__n), std::move(__result)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_count.h b/libcxx/include/__algorithm/pstl_count.h index 2781f6bfd3c9e..64c84d855e4f6 100644 --- a/libcxx/include/__algorithm/pstl_count.h +++ b/libcxx/include/__algorithm/pstl_count.h @@ -11,14 +11,15 @@ #include <__algorithm/count.h> #include <__algorithm/for_each.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_for_each.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__atomic/atomic.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/pstl_transform_reduce.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -70,6 +71,8 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_ForwardIterator> count_if(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "count_if(first, last, pred) requires [first, last) to be ForwardIterators"); auto __res = std::__count_if(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -106,6 +109,8 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI __iter_diff_t<_ForwardIterator> count(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR( + _ForwardIterator, "count(first, last, val) requires [first, last) to be ForwardIterators"); auto __res = std::__count(__policy, std::move(__first), std::move(__last), __value); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_equal.h b/libcxx/include/__algorithm/pstl_equal.h index d235c0f4f4197..0b38197d7f63d 100644 --- a/libcxx/include/__algorithm/pstl_equal.h +++ b/libcxx/include/__algorithm/pstl_equal.h @@ -13,6 +13,7 @@ #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/pstl_transform_reduce.h> #include <__utility/move.h> @@ -74,6 +75,8 @@ equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Pred __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); auto __res = std::__equal(__policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -86,6 +89,8 @@ template >, int> = 0> _LIBCPP_HIDE_FROM_ABI bool equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); return std::equal(__policy, std::move(__first1), std::move(__last1), 
std::move(__first2), std::equal_to{}); } @@ -145,6 +150,8 @@ equal(_ExecutionPolicy&& __policy, _ForwardIterator2 __first2, _ForwardIterator2 __last2, _Pred __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); auto __res = std::__equal( __policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), std::move(__pred)); if (!__res) @@ -162,6 +169,8 @@ equal(_ExecutionPolicy&& __policy, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "equal requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "equal requires ForwardIterators"); return std::equal( __policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__last2), std::equal_to{}); } diff --git a/libcxx/include/__algorithm/pstl_fill.h b/libcxx/include/__algorithm/pstl_fill.h index 488b49a0feec9..fd248506bc4b9 100644 --- a/libcxx/include/__algorithm/pstl_fill.h +++ b/libcxx/include/__algorithm/pstl_fill.h @@ -43,7 +43,6 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI optional<__empty> __fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) noexcept { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); return std::__pstl_frontend_dispatch( _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill, _RawPolicy), [&](_ForwardIterator __g_first, _ForwardIterator __g_last, const _Tp& __g_value) { @@ -63,7 +62,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void fill(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "fill requires ForwardIterators"); if (!std::__fill(__policy, std::move(__first), std::move(__last), __value)) std::__throw_bad_alloc(); } @@ -79,7 +78,6 @@ template , int> = 0> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __fill_n(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _SizeT&& __n, const _Tp& __value) noexcept { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); return std::__pstl_frontend_dispatch( _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_fill_n, _RawPolicy), [&](_ForwardIterator __g_first, _SizeT __g_n, const _Tp& __g_value) { @@ -102,7 +100,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void fill_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _SizeT __n, const _Tp& __value) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "fill_n requires ForwardIterators"); if (!std::__fill_n(__policy, std::move(__first), std::move(__n), __value)) std::__throw_bad_alloc(); } diff --git a/libcxx/include/__algorithm/pstl_find.h b/libcxx/include/__algorithm/pstl_find.h index 5b694db68aead..b4c4dfb2ffb6f 100644 --- a/libcxx/include/__algorithm/pstl_find.h +++ b/libcxx/include/__algorithm/pstl_find.h @@ -11,10 +11,10 @@ #include <__algorithm/comp.h> #include <__algorithm/find.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -50,7 +50,7 
@@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardIterator find_if(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "find_if requires ForwardIterators"); auto __res = std::__find_if(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -88,7 +88,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardIterator find_if_not(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "find_if_not requires ForwardIterators"); auto __res = std::__find_if_not(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); @@ -125,7 +125,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardIterator find(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "find requires ForwardIterators"); auto __res = std::__find(__policy, std::move(__first), std::move(__last), __value); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_for_each.h b/libcxx/include/__algorithm/pstl_for_each.h index bb7b5a61a6dc0..a99eb6d97fd27 100644 --- a/libcxx/include/__algorithm/pstl_for_each.h +++ b/libcxx/include/__algorithm/pstl_for_each.h @@ -11,11 +11,11 @@ #include <__algorithm/for_each.h> #include <__algorithm/for_each_n.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/concepts.h> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -53,7 +53,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void for_each(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Function __func) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "for_each requires ForwardIterators"); if (!std::__for_each(__policy, std::move(__first), std::move(__last), std::move(__func))) std::__throw_bad_alloc(); } @@ -93,7 +93,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void for_each_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __size, _Function __func) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "for_each_n requires a ForwardIterator"); auto __res = std::__for_each_n(__policy, std::move(__first), std::move(__size), std::move(__func)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_generate.h b/libcxx/include/__algorithm/pstl_generate.h index 7133c6f4f4c62..350c0e4798be6 100644 --- a/libcxx/include/__algorithm/pstl_generate.h +++ b/libcxx/include/__algorithm/pstl_generate.h @@ -9,12 +9,12 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_GENERATE_H #define _LIBCPP___ALGORITHM_PSTL_GENERATE_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_for_each.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> 
#include <__iterator/iterator_traits.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -42,7 +42,6 @@ template , int> = 0> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI optional<__empty> __generate(_ExecutionPolicy&& __policy, _ForwardIterator&& __first, _ForwardIterator&& __last, _Generator&& __gen) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); return std::__pstl_frontend_dispatch( _LIBCPP_PSTL_CUSTOMIZATION_POINT(__pstl_generate, _RawPolicy), [&__policy](_ForwardIterator __g_first, _ForwardIterator __g_last, _Generator __g_gen) { @@ -63,7 +62,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void generate(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Generator __gen) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "generate requires ForwardIterators"); if (!std::__generate(__policy, std::move(__first), std::move(__last), std::move(__gen))) std::__throw_bad_alloc(); } @@ -100,7 +99,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void generate_n(_ExecutionPolicy&& __policy, _ForwardIterator __first, _Size __n, _Generator __gen) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "generate_n requires a ForwardIterator"); if (!std::__generate_n(__policy, std::move(__first), std::move(__n), std::move(__gen))) std::__throw_bad_alloc(); } diff --git a/libcxx/include/__algorithm/pstl_is_partitioned.h b/libcxx/include/__algorithm/pstl_is_partitioned.h index b654302122072..c016b388e3784 100644 --- a/libcxx/include/__algorithm/pstl_is_partitioned.h +++ b/libcxx/include/__algorithm/pstl_is_partitioned.h @@ -10,10 +10,11 @@ #define _LIBCPP___ALGORITHM_PSTL_IS_PARITTIONED #include <__algorithm/pstl_any_all_none_of.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_find.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> +#include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -62,6 +63,7 @@ template , int> = 0> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI bool is_partitioned(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "is_partitioned requires ForwardIterators"); auto __res = std::__is_partitioned(__policy, std::move(__first), std::move(__last), std::move(__pred)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_merge.h b/libcxx/include/__algorithm/pstl_merge.h index 3d262db6bc0c1..87f634a67f588 100644 --- a/libcxx/include/__algorithm/pstl_merge.h +++ b/libcxx/include/__algorithm/pstl_merge.h @@ -9,9 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_MERGE_H #define _LIBCPP___ALGORITHM_PSTL_MERGE_H -#include <__algorithm/pstl_backend.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -70,6 +71,10 @@ merge(_ExecutionPolicy&& __policy, _ForwardIterator2 __last2, _ForwardOutIterator __result, _Comp __comp = {}) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, 
"merge requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "merge requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(*__first1), "merge requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(*__first2), "merge requires an OutputIterator"); auto __res = std::__merge( __policy, std::move(__first1), diff --git a/libcxx/include/__algorithm/pstl_move.h b/libcxx/include/__algorithm/pstl_move.h index d8441f1a6c2e1..3155ddedf91bb 100644 --- a/libcxx/include/__algorithm/pstl_move.h +++ b/libcxx/include/__algorithm/pstl_move.h @@ -10,12 +10,13 @@ #define _LIBCPP___ALGORITHM_PSTL_MOVE_H #include <__algorithm/copy_n.h> -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> #include <__functional/identity.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_constant_evaluated.h> #include <__type_traits/is_execution_policy.h> @@ -69,6 +70,10 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator move(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "move requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "move requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(std::move(*__first)), "move requires an OutputIterator"); auto __res = std::__move(__policy, std::move(__first), std::move(__last), std::move(__result)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_replace.h b/libcxx/include/__algorithm/pstl_replace.h index b1caf3fd4ac0a..b2ded54dfe25f 100644 --- a/libcxx/include/__algorithm/pstl_replace.h +++ b/libcxx/include/__algorithm/pstl_replace.h @@ -9,12 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_REPLACE_H #define _LIBCPP___ALGORITHM_PSTL_REPLACE_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_for_each.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_transform.h> #include <__config> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/remove_cvref.h> #include <__utility/move.h> @@ -74,6 +75,7 @@ replace_if(_ExecutionPolicy&& __policy, _ForwardIterator __last, _Pred __pred, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace_if requires ForwardIterators"); auto __res = std::__replace_if(__policy, std::move(__first), std::move(__last), std::move(__pred), __new_value); if (!__res) std::__throw_bad_alloc(); @@ -121,6 +123,7 @@ replace(_ExecutionPolicy&& __policy, _ForwardIterator __last, const _Tp& __old_value, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace requires ForwardIterators"); if (!std::__replace(__policy, std::move(__first), std::move(__last), __old_value, __new_value)) std::__throw_bad_alloc(); } @@ -177,6 +180,11 @@ _LIBCPP_HIDE_FROM_ABI void replace_copy_if( _ForwardOutIterator __result, _Pred __pred, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace_copy_if requires 
ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "replace_copy_if requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "replace_copy_if requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, const _Tp&, "replace_copy requires an OutputIterator"); if (!std::__replace_copy_if( __policy, std::move(__first), std::move(__last), std::move(__result), std::move(__pred), __new_value)) std::__throw_bad_alloc(); @@ -233,6 +241,11 @@ _LIBCPP_HIDE_FROM_ABI void replace_copy( _ForwardOutIterator __result, const _Tp& __old_value, const _Tp& __new_value) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "replace_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "replace_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "replace_copy requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, const _Tp&, "replace_copy requires an OutputIterator"); if (!std::__replace_copy( __policy, std::move(__first), std::move(__last), std::move(__result), __old_value, __new_value)) std::__throw_bad_alloc(); diff --git a/libcxx/include/__algorithm/pstl_rotate_copy.h b/libcxx/include/__algorithm/pstl_rotate_copy.h index 346aab1d4a55c..1a32b710877c1 100644 --- a/libcxx/include/__algorithm/pstl_rotate_copy.h +++ b/libcxx/include/__algorithm/pstl_rotate_copy.h @@ -9,9 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_ROTATE_COPY_H #define _LIBCPP___ALGORITHM_PSTL_ROTATE_COPY_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_copy.h> #include <__algorithm/pstl_frontend_dispatch.h> +#include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/is_execution_policy.h> #include @@ -69,6 +70,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator rotate_copy( _ForwardIterator __middle, _ForwardIterator __last, _ForwardOutIterator __result) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "rotate_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "rotate_copy requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(*__first), "rotate_copy requires an OutputIterator"); auto __res = std::__rotate_copy(__policy, std::move(__first), std::move(__middle), std::move(__last), std::move(__result)); if (!__res) diff --git a/libcxx/include/__algorithm/pstl_sort.h b/libcxx/include/__algorithm/pstl_sort.h index a931f768111a2..769dd81af77e0 100644 --- a/libcxx/include/__algorithm/pstl_sort.h +++ b/libcxx/include/__algorithm/pstl_sort.h @@ -9,11 +9,12 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_SORT_H #define _LIBCPP___ALGORITHM_PSTL_SORT_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__algorithm/pstl_stable_sort.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> #include <__utility/empty.h> @@ -60,6 +61,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp) { + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "sort requires RandomAccessIterators"); if (!std::__sort(__policy, 
std::move(__first), std::move(__last), std::move(__comp))) std::__throw_bad_alloc(); } @@ -70,6 +72,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void sort(_ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last) { + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "sort requires RandomAccessIterators"); std::sort(std::forward<_ExecutionPolicy>(__policy), std::move(__first), std::move(__last), less{}); } diff --git a/libcxx/include/__algorithm/pstl_stable_sort.h b/libcxx/include/__algorithm/pstl_stable_sort.h index 8ea0bb3f9a8d5..f5e0dd40f72b4 100644 --- a/libcxx/include/__algorithm/pstl_stable_sort.h +++ b/libcxx/include/__algorithm/pstl_stable_sort.h @@ -9,9 +9,10 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_STABLE_SORT_H #define _LIBCPP___ALGORITHM_PSTL_STABLE_SORT_H -#include <__algorithm/pstl_backend.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -48,6 +49,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI void stable_sort( _ExecutionPolicy&& __policy, _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp = {}) { + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(_RandomAccessIterator, "stable_sort requires RandomAccessIterators"); if (!std::__stable_sort(__policy, std::move(__first), std::move(__last), std::move(__comp))) std::__throw_bad_alloc(); } diff --git a/libcxx/include/__algorithm/pstl_transform.h b/libcxx/include/__algorithm/pstl_transform.h index f95938782fc3b..80e1d6b496f2e 100644 --- a/libcxx/include/__algorithm/pstl_transform.h +++ b/libcxx/include/__algorithm/pstl_transform.h @@ -9,9 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_TRANSFORM_H #define _LIBCPP___ALGORITHM_PSTL_TRANSFORM_H -#include <__algorithm/pstl_backend.h> #include <__config> #include <__iterator/cpp17_iterator_concepts.h> +#include <__pstl/configuration.h> #include <__type_traits/enable_if.h> #include <__type_traits/is_execution_policy.h> #include <__type_traits/remove_cvref.h> @@ -58,9 +58,10 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator transform( _ForwardIterator __last, _ForwardOutIterator __result, _UnaryOperation __op) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator); - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator); - _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(__op(*__first))); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "transform requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "transform requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(__op(*__first)), "transform requires an OutputIterator"); auto __res = std::__transform(__policy, std::move(__first), std::move(__last), std::move(__result), std::move(__op)); if (!__res) std::__throw_bad_alloc(); @@ -100,10 +101,11 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator transform( _ForwardIterator2 __first2, _ForwardOutIterator __result, _BinaryOperation __op) { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1); - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2); - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator); - _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(_ForwardOutIterator, decltype(__op(*__first1, *__first2))); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "transform requires 
ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "transform requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardOutIterator, "transform requires an OutputIterator"); + _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR( + _ForwardOutIterator, decltype(__op(*__first1, *__first2)), "transform requires an OutputIterator"); auto __res = std::__transform( __policy, std::move(__first1), std::move(__last1), std::move(__first2), std::move(__result), std::move(__op)); if (!__res) diff --git a/libcxx/include/__atomic/atomic.h b/libcxx/include/__atomic/atomic.h index 3dfb6937d0325..bd3f659c22df0 100644 --- a/libcxx/include/__atomic/atomic.h +++ b/libcxx/include/__atomic/atomic.h @@ -462,22 +462,26 @@ atomic_wait_explicit(const atomic<_Tp>* __o, typename atomic<_Tp>::value_type __ // atomic_notify_one template -_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT { +_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void +atomic_notify_one(volatile atomic<_Tp>* __o) _NOEXCEPT { __o->notify_one(); } template -_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT { +_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void +atomic_notify_one(atomic<_Tp>* __o) _NOEXCEPT { __o->notify_one(); } // atomic_notify_all template -_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT { +_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void +atomic_notify_all(volatile atomic<_Tp>* __o) _NOEXCEPT { __o->notify_all(); } template -_LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT { +_LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void +atomic_notify_all(atomic<_Tp>* __o) _NOEXCEPT { __o->notify_all(); } diff --git a/libcxx/include/__atomic/atomic_flag.h b/libcxx/include/__atomic/atomic_flag.h index 084366237c16e..3ec3366ecaaf9 100644 --- a/libcxx/include/__atomic/atomic_flag.h +++ b/libcxx/include/__atomic/atomic_flag.h @@ -49,22 +49,26 @@ struct atomic_flag { __cxx_atomic_store(&__a_, _LIBCPP_ATOMIC_FLAG_TYPE(false), __m); } - _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(bool __v, memory_order __m = memory_order_seq_cst) const - volatile _NOEXCEPT { + _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void + wait(bool __v, memory_order __m = memory_order_seq_cst) const volatile _NOEXCEPT { std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m); } - _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void + _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(bool __v, memory_order __m = memory_order_seq_cst) const _NOEXCEPT { std::__atomic_wait(*this, _LIBCPP_ATOMIC_FLAG_TYPE(__v), __m); } - _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT { + _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() volatile _NOEXCEPT { + std::__atomic_notify_one(*this); + } + _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); } - _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_one() _NOEXCEPT { std::__atomic_notify_one(*this); } _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() volatile _NOEXCEPT { 
std::__atomic_notify_all(*this); } - _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { std::__atomic_notify_all(*this); } + _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void notify_all() _NOEXCEPT { + std::__atomic_notify_all(*this); + } #if _LIBCPP_STD_VER >= 20 _LIBCPP_HIDE_FROM_ABI constexpr atomic_flag() _NOEXCEPT : __a_(false) {} @@ -141,41 +145,43 @@ inline _LIBCPP_HIDE_FROM_ABI void atomic_flag_clear_explicit(atomic_flag* __o, m __o->clear(__m); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait(const volatile atomic_flag* __o, bool __v) _NOEXCEPT { __o->wait(__v); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait(const atomic_flag* __o, bool __v) _NOEXCEPT { __o->wait(__v); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait_explicit(const volatile atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT { __o->wait(__v, __m); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_wait_explicit(const atomic_flag* __o, bool __v, memory_order __m) _NOEXCEPT { __o->wait(__v, __m); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_one(volatile atomic_flag* __o) _NOEXCEPT { __o->notify_one(); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT { +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +atomic_flag_notify_one(atomic_flag* __o) _NOEXCEPT { __o->notify_one(); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_all(volatile atomic_flag* __o) _NOEXCEPT { __o->notify_all(); } -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT { +inline _LIBCPP_DEPRECATED_ATOMIC_SYNC _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_SYNC void +atomic_flag_notify_all(atomic_flag* __o) _NOEXCEPT { __o->notify_all(); } diff --git a/libcxx/include/__availability b/libcxx/include/__availability index bb3ed0a8da521..aa761eb5bfe5e 100644 --- a/libcxx/include/__availability +++ b/libcxx/include/__availability @@ -160,6 +160,15 @@ # define _LIBCPP_AVAILABILITY_HAS_TZDB 1 # define _LIBCPP_AVAILABILITY_TZDB +// These macros determine whether we assume that std::bad_function_call and +// std::bad_expected_access provide a key function in the dylib. This allows +// centralizing their vtable and typeinfo instead of having all TUs provide +// a weak definition that then gets deduplicated. 
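For readers unfamiliar with the key-function idiom the comment above refers to, a minimal sketch follows; the class and file names are illustrative, not libc++'s:

.. code-block:: c++

   #include <exception>

   // error.h (illustrative): the out-of-line virtual declaration below is
   // the class's "key function" under the Itanium C++ ABI.
   struct bad_widget_access : std::exception {
     const char* what() const noexcept override; // declared, not defined inline
   };

   // error.cpp (illustrative): compiled into the library. Defining the key
   // function here makes the compiler emit the vtable and typeinfo for
   // bad_widget_access in this one object file, instead of as a weak
   // definition in every translation unit that uses the class.
   const char* bad_widget_access::what() const noexcept {
     return "bad access to widget";
   }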
+# define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION 1 +# define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION +# define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION 1 +# define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION + #elif defined(__APPLE__) # define _LIBCPP_AVAILABILITY_HAS_BAD_OPTIONAL_ACCESS \ @@ -290,6 +299,13 @@ # else # define _LIBCPP_AVAILABILITY_HAS_ADDITIONAL_IOSTREAM_EXPLICIT_INSTANTIATIONS_1 1 # endif + +# define _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION 0 +# define _LIBCPP_AVAILABILITY_BAD_FUNCTION_CALL_KEY_FUNCTION __attribute__((unavailable)) + +# define _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION 0 +# define _LIBCPP_AVAILABILITY_BAD_EXPECTED_ACCESS_KEY_FUNCTION __attribute__((unavailable)) + #else // ...New vendors can add availability markup here... diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h index 1301cd6f1f1ad..d2c5cf922ba67 100644 --- a/libcxx/include/__chrono/convert_to_tm.h +++ b/libcxx/include/__chrono/convert_to_tm.h @@ -20,6 +20,7 @@ #include <__chrono/month_weekday.h> #include <__chrono/monthday.h> #include <__chrono/statically_widen.h> +#include <__chrono/sys_info.h> #include <__chrono/system_clock.h> #include <__chrono/time_point.h> #include <__chrono/weekday.h> @@ -171,6 +172,10 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) { if (__value.hours().count() > std::numeric_limits::max()) std::__throw_format_error("Formatting hh_mm_ss, encountered an hour overflow"); __result.tm_hour = __value.hours().count(); +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + } else if constexpr (same_as<_ChronoT, chrono::sys_info>) { + // Has no time information. +# endif } else static_assert(sizeof(_ChronoT) == 0, "Add the missing type specialization"); diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index 4ad59382a4148..0196deb8c1ffb 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP___CHRONO_FORMATTER_H #define _LIBCPP___CHRONO_FORMATTER_H +#include <__algorithm/ranges_copy.h> #include <__chrono/calendar.h> #include <__chrono/concepts.h> #include <__chrono/convert_to_tm.h> @@ -23,6 +24,7 @@ #include <__chrono/ostream.h> #include <__chrono/parser_std_format_spec.h> #include <__chrono/statically_widen.h> +#include <__chrono/sys_info.h> #include <__chrono/system_clock.h> #include <__chrono/time_point.h> #include <__chrono/weekday.h> @@ -79,7 +81,7 @@ namespace __formatter { // small). Therefore a duration uses its own conversion. 
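The duration-specific sub-second conversion described in the comment above is observable from user code; a small usage sketch, assuming a C++20 toolchain with <format> and chrono formatting support:

.. code-block:: c++

   #include <chrono>
   #include <format>
   #include <iostream>

   int main() {
     using namespace std::chrono_literals;
     // struct tm has no sub-second field, so the fractional part comes from
     // the library's own conversion; this prints "00:00:01.500".
     std::cout << std::format("{:%T}", 1500ms) << '\n';
   }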
template _LIBCPP_HIDE_FROM_ABI void -__format_sub_seconds(const chrono::duration<_Rep, _Period>& __value, basic_stringstream<_CharT>& __sstr) { +__format_sub_seconds(basic_stringstream<_CharT>& __sstr, const chrono::duration<_Rep, _Period>& __value) { __sstr << std::use_facet>(__sstr.getloc()).decimal_point(); using __duration = chrono::duration<_Rep, _Period>; @@ -110,13 +112,13 @@ __format_sub_seconds(const chrono::duration<_Rep, _Period>& __value, basic_strin } template -_LIBCPP_HIDE_FROM_ABI void __format_sub_seconds(const _Tp& __value, basic_stringstream<_CharT>& __sstr) { - __formatter::__format_sub_seconds(__value.time_since_epoch(), __sstr); +_LIBCPP_HIDE_FROM_ABI void __format_sub_seconds(basic_stringstream<_CharT>& __sstr, const _Tp& __value) { + __formatter::__format_sub_seconds(__sstr, __value.time_since_epoch()); } template _LIBCPP_HIDE_FROM_ABI void -__format_sub_seconds(const chrono::hh_mm_ss<_Duration>& __value, basic_stringstream<_CharT>& __sstr) { +__format_sub_seconds(basic_stringstream<_CharT>& __sstr, const chrono::hh_mm_ss<_Duration>& __value) { __sstr << std::use_facet>(__sstr.getloc()).decimal_point(); if constexpr (chrono::treat_as_floating_point_v) std::format_to(std::ostreambuf_iterator<_CharT>{__sstr}, @@ -143,7 +145,7 @@ consteval bool __use_fraction() { } template -_LIBCPP_HIDE_FROM_ABI void __format_year(int __year, basic_stringstream<_CharT>& __sstr) { +_LIBCPP_HIDE_FROM_ABI void __format_year(basic_stringstream<_CharT>& __sstr, int __year) { if (__year < 0) { __sstr << _CharT('-'); __year = -__year; @@ -159,7 +161,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_year(int __year, basic_stringstream<_CharT>& } template -_LIBCPP_HIDE_FROM_ABI void __format_century(int __year, basic_stringstream<_CharT>& __sstr) { +_LIBCPP_HIDE_FROM_ABI void __format_century(basic_stringstream<_CharT>& __sstr, int __year) { // TODO FMT Write an issue // [tab:time.format.spec] // %C The year divided by 100 using floored division. If the result is a @@ -170,10 +172,51 @@ _LIBCPP_HIDE_FROM_ABI void __format_century(int __year, basic_stringstream<_Char __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), __century); } +// Implements the %z format specifier according to [tab:time.format.spec], where +// '__modifier' signals %Oz or %Ez were used. (Both modifiers behave the same, +// so there is no need to distinguish between them.) +template +_LIBCPP_HIDE_FROM_ABI void +__format_zone_offset(basic_stringstream<_CharT>& __sstr, chrono::seconds __offset, bool __modifier) { + if (__offset < 0s) { + __sstr << _CharT('-'); + __offset = -__offset; + } else { + __sstr << _CharT('+'); + } + + chrono::hh_mm_ss __hms{__offset}; + std::ostreambuf_iterator<_CharT> __out_it{__sstr}; + // Note HMS does not allow formatting hours > 23, but the offset is not limited to 24H. + std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), __hms.hours().count()); + if (__modifier) + __sstr << _CharT(':'); + std::format_to(__out_it, _LIBCPP_STATICALLY_WIDEN(_CharT, "{:02}"), __hms.minutes().count()); +} + +// Helper to store the time zone information needed for formatting. +struct _LIBCPP_HIDE_FROM_ABI __time_zone { + // Typically these abbreviations are short and fit in the string's internal + // buffer. 
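The %C note quoted above hinges on floored versus truncating division; the two disagree for negative years, which is why the built-in operator alone does not suffice. A quick standalone illustration (floored_div is a hypothetical helper, not libc++'s):

.. code-block:: c++

   #include <iostream>

   // C++'s '/' truncates toward zero; %C is specified with floored
   // division, which rounds toward negative infinity instead.
   int floored_div(int a, int b) {
     const int q = a / b;
     const int r = a % b;
     return (r != 0 && (r < 0) != (b < 0)) ? q - 1 : q;
   }

   int main() {
     std::cout << (-101 / 100) << '\n';           // -1 (truncated)
     std::cout << floored_div(-101, 100) << '\n'; // -2 (floored: the century of year -101)
   }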
+ string __abbrev; + chrono::seconds __offset; +}; + +template +_LIBCPP_HIDE_FROM_ABI __time_zone __convert_to_time_zone([[maybe_unused]] const _Tp& __value) { +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + if constexpr (same_as<_Tp, chrono::sys_info>) + return {__value.abbrev, __value.offset}; + else +# endif + return {"UTC", chrono::seconds{0}}; +} + template _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( - const _Tp& __value, basic_stringstream<_CharT>& __sstr, basic_string_view<_CharT> __chrono_specs) { + basic_stringstream<_CharT>& __sstr, const _Tp& __value, basic_string_view<_CharT> __chrono_specs) { tm __t = std::__convert_to_tm(__value); + __time_zone __z = __formatter::__convert_to_time_zone(__value); const auto& __facet = std::use_facet>(__sstr.getloc()); for (auto __it = __chrono_specs.begin(); __it != __chrono_specs.end(); ++__it) { if (*__it == _CharT('%')) { @@ -196,7 +239,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( // strftime's output is only defined in the range [00, 99]. int __year = __t.tm_year + 1900; if (__year < 1000 || __year > 9999) - __formatter::__format_century(__year, __sstr); + __formatter::__format_century(__sstr, __year); else __facet.put( {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); @@ -242,7 +285,7 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( __facet.put( {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); if constexpr (__use_fraction<_Tp>()) - __formatter::__format_sub_seconds(__value, __sstr); + __formatter::__format_sub_seconds(__sstr, __value); break; // Unlike time_put and strftime the formatting library requires %Y @@ -283,22 +326,24 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( // Depending on the platform's libc the range of supported years is // limited. Intead of of testing all conditions use the internal // implementation unconditionally. - __formatter::__format_year(__t.tm_year + 1900, __sstr); + __formatter::__format_year(__sstr, __t.tm_year + 1900); break; - case _CharT('F'): { - int __year = __t.tm_year + 1900; - if (__year < 1000) { - __formatter::__format_year(__year, __sstr); - __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "-{:02}-{:02}"), __t.tm_mon + 1, __t.tm_mday); - } else - __facet.put( - {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); - } break; + case _CharT('F'): + // Depending on the platform's libc the range of supported years is + // limited. Instead of testing all conditions use the internal + // implementation unconditionally. + __formatter::__format_year(__sstr, __t.tm_year + 1900); + __sstr << std::format(_LIBCPP_STATICALLY_WIDEN(_CharT, "-{:02}-{:02}"), __t.tm_mon + 1, __t.tm_mday); + break; + + case _CharT('z'): + __formatter::__format_zone_offset(__sstr, __z.__offset, false); + break; case _CharT('Z'): - // TODO FMT Add proper timezone support. - __sstr << _LIBCPP_STATICALLY_WIDEN(_CharT, "UTC"); + // __abbrev is always a char so the copy may convert. 
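To make the new %z/%Ez handling concrete, here is a standalone re-creation of the same logic (print_zone_offset is illustrative, not libc++'s helper); note that the E/O modifier only toggles the colon separator:

.. code-block:: c++

   #include <chrono>
   #include <cstdio>

   // Prints a UTC offset as +HHMM, or +HH:MM when the E/O modifier was
   // used. Offsets beyond 24h simply produce more hour digits.
   void print_zone_offset(std::chrono::seconds offset, bool modifier) {
     char sign = '+';
     if (offset < std::chrono::seconds{0}) {
       sign = '-';
       offset = -offset;
     }
     const auto h = std::chrono::duration_cast<std::chrono::hours>(offset);
     const auto m = std::chrono::duration_cast<std::chrono::minutes>(offset - h);
     std::printf(modifier ? "%c%02lld:%02lld\n" : "%c%02lld%02lld\n", sign,
                 static_cast<long long>(h.count()),
                 static_cast<long long>(m.count()));
   }

   int main() {
     print_zone_offset(std::chrono::seconds{5400}, false); // "+0130"
     print_zone_offset(std::chrono::seconds{5400}, true);  // "+01:30"
     print_zone_offset(std::chrono::seconds{-3600}, true); // "-01:00"
   }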
+ ranges::copy(__z.__abbrev, std::ostreambuf_iterator<_CharT>{__sstr}); break; case _CharT('O'): @@ -310,13 +355,19 @@ _LIBCPP_HIDE_FROM_ABI void __format_chrono_using_chrono_specs( ++__it; __facet.put( {__sstr}, __sstr, _CharT(' '), std::addressof(__t), std::to_address(__s), std::to_address(__it + 1)); - __formatter::__format_sub_seconds(__value, __sstr); + __formatter::__format_sub_seconds(__sstr, __value); break; } } + + // Oz produces the same output as Ez below. [[fallthrough]]; case _CharT('E'): ++__it; + if (*__it == 'z') { + __formatter::__format_zone_offset(__sstr, __z.__offset, true); + break; + } [[fallthrough]]; default: __facet.put( @@ -365,6 +416,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_ok(const _Tp& __value) { return __value.weekday().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -405,6 +460,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __weekday_name_ok(const _Tp& __value) { return __value.weekday().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -445,6 +504,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __date_ok(const _Tp& __value) { return __value.ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -485,6 +548,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool __month_name_ok(const _Tp& __value) { return __value.month().ok(); else if constexpr (__is_hh_mm_ss<_Tp>) return true; +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + else if constexpr (same_as<_Tp, chrono::sys_info>) + return true; +# endif else static_assert(sizeof(_Tp) == 0, "Add the missing type specialization"); } @@ -512,7 +579,7 @@ __format_chrono(const _Tp& __value, if constexpr (chrono::__is_duration<_Tp>::value) { if (__value < __value.zero()) __sstr << _CharT('-'); - __formatter::__format_chrono_using_chrono_specs(chrono::abs(__value), __sstr, __chrono_specs); + __formatter::__format_chrono_using_chrono_specs(__sstr, chrono::abs(__value), __chrono_specs); // TODO FMT When keeping the precision it will truncate the string. // Note that the behaviour what the precision does isn't specified. 
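Each of the validators extended in this hunk falls through to static_assert(sizeof(_Tp) == 0, ...). That is the classic dependent-false trick: the condition can never hold, but because it depends on the template parameter it fires only when the unhandled branch is actually instantiated. A minimal sketch of the pattern:

.. code-block:: c++

   #include <type_traits>

   template <class T>
   constexpr bool check(const T&) {
     if constexpr (std::is_integral_v<T>)
       return true;
     else
       // Never true, but dependent on T: the assert triggers only if this
       // branch is instantiated, i.e. for a type the function never handled.
       static_assert(sizeof(T) == 0, "Add the missing type specialization");
   }

   int main() {
     return check(42) ? 0 : 1; // fine; check(3.14) would fail to compile
   }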
__specs.__precision_ = -1; @@ -556,7 +623,7 @@ __format_chrono(const _Tp& __value, __sstr << _CharT('-'); } - __formatter::__format_chrono_using_chrono_specs(__value, __sstr, __chrono_specs); + __formatter::__format_chrono_using_chrono_specs(__sstr, __value, __chrono_specs); } } @@ -814,6 +881,20 @@ struct formatter, _CharT> : public __formatter_chron return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__time); } }; + +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +template <__fmt_char_type _CharT> +struct formatter : public __formatter_chrono<_CharT> { +public: + using _Base = __formatter_chrono<_CharT>; + + template + _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) { + return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__time_zone); + } +}; +# endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + #endif // if _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index b687ef8059d5f..9476406d76107 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -19,6 +19,7 @@ #include <__chrono/month_weekday.h> #include <__chrono/monthday.h> #include <__chrono/statically_widen.h> +#include <__chrono/sys_info.h> #include <__chrono/system_clock.h> #include <__chrono/weekday.h> #include <__chrono/year.h> @@ -262,6 +263,25 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const hh_mm_ss<_Duration> __hms return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%T}"), __hms); } +# if !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + +template +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& +operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_info& __info) { + // __info.abbrev is always std::basic_string. + // Since these strings typically are short the conversion should be cheap. 
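The streaming operator introduced here gives chrono::sys_info the textual shape [begin, end) offset save "abbrev". Assuming a standard library with full tzdb support, usage looks like this (the zone name and printed values are examples only):

.. code-block:: c++

   #include <chrono>
   #include <iostream>

   int main() {
     using namespace std::chrono;
     // get_info() queries the time zone database; the new operator<< then
     // prints something like:
     //   [2024-03-31 01:00:00, 2024-10-27 01:00:00) 02:00:00 60min "CEST"
     const time_zone* tz = locate_zone("Europe/Berlin");
     std::cout << tz->get_info(system_clock::now()) << '\n';
   }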
+ std::basic_string<_CharT> __abbrev{__info.abbrev.begin(), __info.abbrev.end()}; + return __os << std::format( + _LIBCPP_STATICALLY_WIDEN(_CharT, "[{:%F %T}, {:%F %T}) {:%T} {:%Q%q} \"{}\""), + __info.begin, + __info.end, + hh_mm_ss{__info.offset}, + __info.save, + __abbrev); +} + +# endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) + } // namespace chrono #endif // if _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__chrono/sys_info.h b/libcxx/include/__chrono/sys_info.h index 794d22f2ccc1e..461d5322d413b 100644 --- a/libcxx/include/__chrono/sys_info.h +++ b/libcxx/include/__chrono/sys_info.h @@ -42,7 +42,7 @@ struct sys_info { } // namespace chrono -# endif //_LIBCPP_STD_VER >= 20 +# endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__config b/libcxx/include/__config index 82782b31c557b..4ccef2ca0d73b 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -16,17 +16,6 @@ # pragma GCC system_header #endif -#if defined(_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES) && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) -# pragma clang deprecated( \ - _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES, \ - "_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES is deprecated in LLVM 18 and will be removed in LLVM 19") -#endif -#if defined(_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES) && !defined(_LIBCPP_DISABLE_DEPRECATION_WARNINGS) -# pragma clang deprecated( \ - _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES, \ - "_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES is deprecated in LLVM 18 and will be removed in LLVM 19") -#endif - #if defined(__apple_build_version__) // Given AppleClang XX.Y.Z, _LIBCPP_APPLE_CLANG_VER is XXYZ (e.g. AppleClang 14.0.3 => 1403) # define _LIBCPP_COMPILER_CLANG_BASED @@ -120,14 +109,11 @@ # define _LIBCPP_ABI_FIX_UNORDERED_NODE_POINTER_UB # define _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB # define _LIBCPP_ABI_FIX_UNORDERED_CONTAINER_SIZE_TYPE -// Define a key function for `bad_function_call` in the library, to centralize -// its vtable and typeinfo to libc++ rather than having all other libraries -// using that class define their own copies. -# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION -// Override the default return value of exception::what() for -// bad_function_call::what() with a string that is specific to -// bad_function_call (see http://wg21.link/LWG2233). This is an ABI break -// because it changes the vtable layout of bad_function_call. +// Override the default return value of exception::what() for bad_function_call::what() +// with a string that is specific to bad_function_call (see http://wg21.link/LWG2233). +// This is an ABI break on platforms that sign and authenticate vtable function pointers +// because it changes the mangling of the virtual function located in the vtable, which +// changes how it gets signed. # define _LIBCPP_ABI_BAD_FUNCTION_CALL_GOOD_WHAT_MESSAGE // Enable optimized version of __do_get_(un)signed which avoids redundant copies. # define _LIBCPP_ABI_OPTIMIZED_LOCALE_NUM_GET @@ -197,19 +183,6 @@ # if defined(__FreeBSD__) && __FreeBSD__ < 14 # define _LIBCPP_DEPRECATED_ABI_DISABLE_PAIR_TRIVIAL_COPY_CTOR # endif -// For XCOFF linkers, we have problems if we see a weak hidden version of a symbol -// in user code (like you get with -fvisibility-inlines-hidden) and then a strong def -// in the library, so we need to always rely on the library version. 
-# if defined(_AIX) -# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION -# endif -# endif - -# if defined(_LIBCPP_BUILDING_LIBRARY) || _LIBCPP_ABI_VERSION >= 2 -// Define a key function for `bad_function_call` in the library, to centralize -// its vtable and typeinfo to libc++ rather than having all other libraries -// using that class define their own copies. -# define _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION # endif // We had some bugs where we use [[no_unique_address]] together with construct_at, @@ -972,6 +945,14 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_(m) # endif +# if _LIBCPP_STD_VER < 20 +# define _LIBCPP_DEPRECATED_ATOMIC_SYNC \ + _LIBCPP_DEPRECATED_("The C++20 synchronization library has been deprecated prior to C++20. Please update to " \ + "using -std=c++20 if you need to use these facilities.") +# else +# define _LIBCPP_DEPRECATED_ATOMIC_SYNC /* nothing */ +# endif + # if !defined(_LIBCPP_CXX03_LANG) # define _LIBCPP_DEPRECATED_IN_CXX11 _LIBCPP_DEPRECATED # else @@ -1238,21 +1219,6 @@ typedef __char32_t char32_t; # define _LIBCPP_IF_WIDE_CHARACTERS(...) __VA_ARGS__ # endif -# if defined(_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES) -# define _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR -# define _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS -# define _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE -# define _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS -# define _LIBCPP_ENABLE_CXX17_REMOVED_UNARY_BINARY_FUNCTION -# endif // _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES - -# if defined(_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES) -# define _LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS -# define _LIBCPP_ENABLE_CXX20_REMOVED_NEGATORS -# define _LIBCPP_ENABLE_CXX20_REMOVED_RAW_STORAGE_ITERATOR -# define _LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS -# endif // _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES - // clang-format off # define _LIBCPP_PUSH_MACROS _Pragma("push_macro(\"min\")") _Pragma("push_macro(\"max\")") _Pragma("push_macro(\"refresh\")") _Pragma("push_macro(\"move\")") _Pragma("push_macro(\"erase\")") # define _LIBCPP_POP_MACROS _Pragma("pop_macro(\"min\")") _Pragma("pop_macro(\"max\")") _Pragma("pop_macro(\"refresh\")") _Pragma("pop_macro(\"move\")") _Pragma("pop_macro(\"erase\")") diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index 7c002c5bfcf8e..89a14609ee3f9 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -32,9 +32,9 @@ #cmakedefine _LIBCPP_INSTRUMENTED_WITH_ASAN // PSTL backends -#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL -#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD -#cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH +#cmakedefine _LIBCPP_PSTL_BACKEND_SERIAL +#cmakedefine _LIBCPP_PSTL_BACKEND_STD_THREAD +#cmakedefine _LIBCPP_PSTL_BACKEND_LIBDISPATCH // Hardening. 
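The new _LIBCPP_DEPRECATED_ATOMIC_SYNC macro above expands to a deprecation attribute only before C++20 and to nothing afterwards. A generic sketch of that version-gated pattern (MYLIB_DEPRECATED_SYNC and notify_one_shim are illustrative names):

.. code-block:: c++

   #include <iostream>

   #if __cplusplus < 202002L
   #  define MYLIB_DEPRECATED_SYNC \
        [[deprecated("update to -std=c++20 if you need these facilities")]]
   #else
   #  define MYLIB_DEPRECATED_SYNC /* nothing */
   #endif

   // Warns only when compiled before C++20, mirroring how the patch gates
   // the deprecation of the pre-C++20 atomic wait/notify backports.
   MYLIB_DEPRECATED_SYNC void notify_one_shim() { std::cout << "notified\n"; }

   int main() { notify_one_shim(); }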
#cmakedefine _LIBCPP_HARDENING_MODE_DEFAULT @_LIBCPP_HARDENING_MODE_DEFAULT@ diff --git a/libcxx/include/__expected/bad_expected_access.h b/libcxx/include/__expected/bad_expected_access.h index 9d490307b6808..ef29fa5088313 100644 --- a/libcxx/include/__expected/bad_expected_access.h +++ b/libcxx/include/__expected/bad_expected_access.h @@ -9,6 +9,7 @@ #ifndef _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H #define _LIBCPP___EXPECTED_BAD_EXPECTED_ACCESS_H +#include <__availability> #include <__config> #include <__exception/exception.h> #include <__utility/move.h> @@ -28,9 +29,11 @@ template class bad_expected_access; _LIBCPP_DIAGNOSTIC_PUSH +# if !_LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wweak-vtables") +# endif template <> -class bad_expected_access : public exception { +class _LIBCPP_EXPORTED_FROM_ABI bad_expected_access : public exception { protected: _LIBCPP_HIDE_FROM_ABI bad_expected_access() noexcept = default; _LIBCPP_HIDE_FROM_ABI bad_expected_access(const bad_expected_access&) noexcept = default; @@ -40,11 +43,11 @@ class bad_expected_access : public exception { _LIBCPP_HIDE_FROM_ABI_VIRTUAL ~bad_expected_access() override = default; public: - // The way this has been designed (by using a class template below) means that we'll already - // have a profusion of these vtables in TUs, and the dynamic linker will already have a bunch - // of work to do. So it is not worth hiding the specialization in the dylib, given that - // it adds deployment target restrictions. +# if _LIBCPP_AVAILABILITY_HAS_BAD_EXPECTED_ACCESS_KEY_FUNCTION + const char* what() const noexcept override; +# else _LIBCPP_HIDE_FROM_ABI_VIRTUAL const char* what() const noexcept override { return "bad access to std::expected"; } +# endif }; _LIBCPP_DIAGNOSTIC_POP diff --git a/libcxx/include/__functional/function.h b/libcxx/include/__functional/function.h index 1faa9e92ebd63..36057706933d4 100644 --- a/libcxx/include/__functional/function.h +++ b/libcxx/include/__functional/function.h @@ -11,6 +11,7 @@ #define _LIBCPP___FUNCTIONAL_FUNCTION_H #include <__assert> +#include <__availability> #include <__config> #include <__exception/exception.h> #include <__functional/binary_function.h> @@ -55,7 +56,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD // bad_function_call _LIBCPP_DIAGNOSTIC_PUSH +# if !_LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wweak-vtables") +# endif class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { public: _LIBCPP_HIDE_FROM_ABI bad_function_call() _NOEXCEPT = default; @@ -64,7 +67,7 @@ class _LIBCPP_EXPORTED_FROM_ABI bad_function_call : public exception { // Note that when a key function is not used, every translation unit that uses // bad_function_call will end up containing a weak definition of the vtable and // typeinfo. 
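The push/ignore/pop dance around these classes exists because, without a key function, every virtual member is inline and Clang's -Wweak-vtables flags the weak per-TU vtable copies. With raw pragmas (which the _LIBCPP_* wrappers expand to), the pattern looks like:

.. code-block:: c++

   // inline_only_error is an illustrative stand-in for a class that
   // intentionally ships without a key function.
   #pragma clang diagnostic push
   #pragma clang diagnostic ignored "-Wweak-vtables"
   struct inline_only_error {
     virtual ~inline_only_error() = default; // inline: vtable stays weak
   };
   #pragma clang diagnostic pop

   int main() {
     inline_only_error e;
     (void)e;
   }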
-# ifdef _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION +# if _LIBCPP_AVAILABILITY_HAS_BAD_FUNCTION_CALL_KEY_FUNCTION ~bad_function_call() _NOEXCEPT override; # else _LIBCPP_HIDE_FROM_ABI_VIRTUAL ~bad_function_call() _NOEXCEPT override {} diff --git a/libcxx/include/__fwd/ios.h b/libcxx/include/__fwd/ios.h index 82c865d58cc75..48350709d4ce2 100644 --- a/libcxx/include/__fwd/ios.h +++ b/libcxx/include/__fwd/ios.h @@ -18,6 +18,8 @@ _LIBCPP_BEGIN_NAMESPACE_STD +class _LIBCPP_EXPORTED_FROM_ABI ios_base; + template > class _LIBCPP_TEMPLATE_VIS basic_ios; diff --git a/libcxx/include/__iterator/cpp17_iterator_concepts.h b/libcxx/include/__iterator/cpp17_iterator_concepts.h index cdb561e68452a..9d5a392582da4 100644 --- a/libcxx/include/__iterator/cpp17_iterator_concepts.h +++ b/libcxx/include/__iterator/cpp17_iterator_concepts.h @@ -157,29 +157,31 @@ concept __cpp17_random_access_iterator = _LIBCPP_END_NAMESPACE_STD # ifndef _LIBCPP_DISABLE_ITERATOR_CHECKS -# define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t) static_assert(::std::__cpp17_input_iterator); -# define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t) \ - static_assert(::std::__cpp17_output_iterator); -# define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t) static_assert(::std::__cpp17_forward_iterator); -# define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t) \ - static_assert(::std::__cpp17_bidirectional_iterator); -# define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t) \ - static_assert(::std::__cpp17_random_access_iterator); +# define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t, message) \ + static_assert(::std::__cpp17_input_iterator, message) +# define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t, message) \ + static_assert(::std::__cpp17_output_iterator, message) +# define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t, message) \ + static_assert(::std::__cpp17_forward_iterator, message) +# define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t, message) \ + static_assert(::std::__cpp17_bidirectional_iterator, message) +# define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t, message) \ + static_assert(::std::__cpp17_random_access_iterator, message) # else -# define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t) -# define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t) -# define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t) -# define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t) -# define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t) +# define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t, message) static_assert(true) # endif #else // _LIBCPP_STD_VER >= 20 -# define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t) -# define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t) -# define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t) -# define _LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t) -# define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t) +# define _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(iter_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_OUTPUT_ITERATOR(iter_t, write_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(iter_t, message) static_assert(true) +# define 
_LIBCPP_REQUIRE_CPP17_BIDIRECTIONAL_ITERATOR(iter_t, message) static_assert(true) +# define _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(iter_t, message) static_assert(true) #endif // _LIBCPP_STD_VER >= 20 diff --git a/libcxx/include/__numeric/pstl_reduce.h b/libcxx/include/__numeric/pstl_reduce.h index f9f666c2bb38b..d678b9480070b 100644 --- a/libcxx/include/__numeric/pstl_reduce.h +++ b/libcxx/include/__numeric/pstl_reduce.h @@ -12,6 +12,7 @@ #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__functional/identity.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__iterator/iterator_traits.h> #include <__numeric/pstl_transform_reduce.h> #include <__type_traits/is_execution_policy.h> @@ -66,6 +67,7 @@ reduce(_ExecutionPolicy&& __policy, _ForwardIterator __last, _Tp __init, _BinaryOperation __op = {}) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "reduce requires ForwardIterators"); auto __res = std::__reduce(__policy, std::move(__first), std::move(__last), std::move(__init), std::move(__op)); if (!__res) std::__throw_bad_alloc(); @@ -94,6 +96,7 @@ template , int> = 0> _LIBCPP_HIDE_FROM_ABI __iter_value_type<_ForwardIterator> reduce(_ExecutionPolicy&& __policy, _ForwardIterator __first, _ForwardIterator __last) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "reduce requires ForwardIterators"); auto __res = std::__reduce(__policy, std::move(__first), std::move(__last)); if (!__res) std::__throw_bad_alloc(); diff --git a/libcxx/include/__numeric/pstl_transform_reduce.h b/libcxx/include/__numeric/pstl_transform_reduce.h index 07ecf0d9956bb..fe41b1c86f3b1 100644 --- a/libcxx/include/__numeric/pstl_transform_reduce.h +++ b/libcxx/include/__numeric/pstl_transform_reduce.h @@ -9,11 +9,12 @@ #ifndef _LIBCPP___NUMERIC_PSTL_TRANSFORM_REDUCE_H #define _LIBCPP___NUMERIC_PSTL_TRANSFORM_REDUCE_H -#include <__algorithm/pstl_backend.h> #include <__algorithm/pstl_frontend_dispatch.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/cpp17_iterator_concepts.h> #include <__numeric/transform_reduce.h> +#include <__pstl/configuration.h> #include <__type_traits/is_execution_policy.h> #include <__utility/move.h> #include @@ -72,6 +73,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce( _Tp __init, _BinaryOperation1 __reduce, _BinaryOperation2 __transform) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "transform_reduce requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "transform_reduce requires ForwardIterators"); auto __res = std::__transform_reduce( __policy, std::move(__first1), @@ -99,6 +102,8 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce( _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Tp __init) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator1, "transform_reduce requires ForwardIterators"); + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator2, "transform_reduce requires ForwardIterators"); return std::transform_reduce(__policy, __first1, __last1, __first2, __init, plus{}, multiplies{}); } @@ -140,6 +145,7 @@ _LIBCPP_HIDE_FROM_ABI _Tp transform_reduce( _Tp __init, _BinaryOperation __reduce, _UnaryOperation __transform) { + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(_ForwardIterator, "transform_reduce requires ForwardIterators"); auto __res = std::__transform_reduce( __policy, std::move(__first), std::move(__last), std::move(__init), std::move(__reduce), std::move(__transform)); if (!__res) diff --git 
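
Note: the `_LIBCPP_REQUIRE_CPP17_*` macros above now take a user-facing message, and their disabled forms expand to `static_assert(true)` rather than nothing, so the semicolon at each call site stays well-formed. A minimal sketch of what this buys users (the program below is ours, not part of the patch; the serial `std::reduce` overload accepts input iterators, while the parallel overloads now reject them with the message added in the `pstl_reduce.h`/`pstl_transform_reduce.h` hunks above):

```cpp
#include <iterator>
#include <numeric>
#include <sstream>

int main() {
  std::istringstream in("1 2 3");
  std::istream_iterator<int> first(in), last;
  // std::istream_iterator is only a Cpp17InputIterator, so with the checks
  // enabled a parallel call such as
  //   std::reduce(std::execution::par, first, last, 0);
  // would now fail to compile with:
  //   static assertion failed: reduce requires ForwardIterators
  int sum = std::reduce(first, last, 0); // serial overload: no such requirement
  return sum == 6 ? 0 : 1;
}
```
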
a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__pstl/backends/libdispatch.h similarity index 89% rename from libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h rename to libcxx/include/__pstl/backends/libdispatch.h index 8757f24968037..977b06b9a489c 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h +++ b/libcxx/include/__pstl/backends/libdispatch.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H +#ifndef _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H +#define _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H #include <__algorithm/inplace_merge.h> #include <__algorithm/lower_bound.h> @@ -23,6 +23,7 @@ #include <__memory/construct_at.h> #include <__memory/unique_ptr.h> #include <__numeric/reduce.h> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/exception_guard.h> @@ -40,8 +41,6 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { -struct __libdispatch_backend_tag {}; - namespace __libdispatch { // ::dispatch_apply is marked as __attribute__((nothrow)) because it doesn't let exceptions propagate, and neither do // we. @@ -85,7 +84,7 @@ template <> struct __cpu_traits<__libdispatch_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { return __libdispatch::__dispatch_parallel_for( __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); } @@ -105,14 +104,14 @@ struct __cpu_traits<__libdispatch_backend_tag> { typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, - _Compare __comp, - _LeafMerge __leaf_merge) noexcept { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __result, + _Compare __comp, + _LeafMerge __leaf_merge) noexcept { __libdispatch::__chunk_partitions __partitions = __libdispatch::__partition_chunks(std::max(__last1 - __first1, __last2 - __first2)); @@ -201,7 +200,7 @@ struct __cpu_traits<__libdispatch_backend_tag> { } template - _LIBCPP_HIDE_FROM_ABI static optional<_Value> __parallel_transform_reduce( + _LIBCPP_HIDE_FROM_ABI static optional<_Value> __transform_reduce( _RandomAccessIterator __first, _RandomAccessIterator __last, _Transform __transform, @@ -248,8 +247,8 @@ struct __cpu_traits<__libdispatch_backend_tag> { } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { const auto __size = __last - __first; auto __partitions = __libdispatch::__partition_chunks(__size); @@ 
-349,4 +348,14 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H +// Implement PSTL algorithms based on the __cpu_traits specialized above +#include <__algorithm/pstl_backends/cpu_backends/any_of.h> +#include <__algorithm/pstl_backends/cpu_backends/fill.h> +#include <__algorithm/pstl_backends/cpu_backends/find_if.h> +#include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> +#include <__algorithm/pstl_backends/cpu_backends/transform.h> +#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> + +#endif // _LIBCPP___PSTL_BACKENDS_LIBDISPATCH_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h b/libcxx/include/__pstl/backends/serial.h similarity index 57% rename from libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h rename to libcxx/include/__pstl/backends/serial.h index c3d2905daed17..8bb8945093096 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/serial.h +++ b/libcxx/include/__pstl/backends/serial.h @@ -7,10 +7,11 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H +#ifndef _LIBCPP___PSTL_BACKENDS_SERIAL_H +#define _LIBCPP___PSTL_BACKENDS_SERIAL_H #include <__config> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/move.h> @@ -29,26 +30,24 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { -struct __serial_backend_tag {}; - template <> struct __cpu_traits<__serial_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { __f(__first, __last); return __empty{}; } template _LIBCPP_HIDE_FROM_ABI static optional<_Tp> - __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { return __reduce(std::move(__first), std::move(__last), std::move(__init)); } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { __leaf_sort(__first, __last, __comp); return __empty{}; } @@ -60,14 +59,14 @@ struct __cpu_traits<__serial_backend_tag> { class _RandomAccessIterator3, class _Compare, class _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1, + _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { 
__leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); return __empty{}; } @@ -82,4 +81,14 @@ _LIBCPP_POP_MACROS #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_SERIAL_H +// Implement PSTL algorithms based on the __cpu_traits specialized above +#include <__algorithm/pstl_backends/cpu_backends/any_of.h> +#include <__algorithm/pstl_backends/cpu_backends/fill.h> +#include <__algorithm/pstl_backends/cpu_backends/find_if.h> +#include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> +#include <__algorithm/pstl_backends/cpu_backends/transform.h> +#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> + +#endif // _LIBCPP___PSTL_BACKENDS_SERIAL_H diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h b/libcxx/include/__pstl/backends/std_thread.h similarity index 59% rename from libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h rename to libcxx/include/__pstl/backends/std_thread.h index 8d1cb221c3d82..ab09f42cfdd8d 100644 --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/thread.h +++ b/libcxx/include/__pstl/backends/std_thread.h @@ -6,11 +6,12 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_THREAD_H -#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_THREAD_H +#ifndef _LIBCPP___PSTL_BACKENDS_STD_THREAD_H +#define _LIBCPP___PSTL_BACKENDS_STD_THREAD_H #include <__assert> #include <__config> +#include <__pstl/configuration_fwd.h> #include <__pstl/cpu_algos/cpu_traits.h> #include <__utility/empty.h> #include <__utility/move.h> @@ -32,26 +33,24 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD namespace __pstl { -struct __std_thread_backend_tag {}; - template <> struct __cpu_traits<__std_thread_backend_tag> { template _LIBCPP_HIDE_FROM_ABI static optional<__empty> - __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { + __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) { __f(__first, __last); return __empty{}; } template _LIBCPP_HIDE_FROM_ABI static optional<_Tp> - __parallel_transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { + __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __reduce) { return __reduce(std::move(__first), std::move(__last), std::move(__init)); } template - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_stable_sort( - _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, _LeafSort __leaf_sort) { __leaf_sort(__first, __last, __comp); return __empty{}; } @@ -63,14 +62,14 @@ struct __cpu_traits<__std_thread_backend_tag> { class _RandomAccessIterator3, class _Compare, class _LeafMerge> - _LIBCPP_HIDE_FROM_ABI static optional<__empty> __parallel_merge( - _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __outit, - _Compare __comp, - _LeafMerge __leaf_merge) { + _LIBCPP_HIDE_FROM_ABI static optional<__empty> + __merge(_RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1, + _RandomAccessIterator2 __first2, + _RandomAccessIterator2 __last2, + _RandomAccessIterator3 __outit, + _Compare __comp, + _LeafMerge __leaf_merge) { __leaf_merge(__first1, __last1, __first2, __last2, __outit, __comp); return __empty{}; } @@ -85,4 +84,14 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_THREAD_H +// Implement PSTL algorithms based on the __cpu_traits specialized above +#include <__algorithm/pstl_backends/cpu_backends/any_of.h> +#include <__algorithm/pstl_backends/cpu_backends/fill.h> +#include <__algorithm/pstl_backends/cpu_backends/find_if.h> +#include <__algorithm/pstl_backends/cpu_backends/for_each.h> +#include <__algorithm/pstl_backends/cpu_backends/merge.h> +#include <__algorithm/pstl_backends/cpu_backends/stable_sort.h> +#include <__algorithm/pstl_backends/cpu_backends/transform.h> +#include <__algorithm/pstl_backends/cpu_backends/transform_reduce.h> + +#endif // _LIBCPP___PSTL_BACKENDS_STD_THREAD_H diff --git a/libcxx/include/__pstl/configuration.h b/libcxx/include/__pstl/configuration.h new file mode 100644 index 0000000000000..d32bd21df1f9e --- /dev/null +++ b/libcxx/include/__pstl/configuration.h @@ -0,0 +1,27 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___PSTL_CONFIGURATION_H +#define _LIBCPP___PSTL_CONFIGURATION_H + +#include <__config> +#include <__pstl/configuration_fwd.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +# include <__pstl/backends/serial.h> +#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +# include <__pstl/backends/std_thread.h> +#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +# include <__pstl/backends/libdispatch.h> +#endif + +#endif // _LIBCPP___PSTL_CONFIGURATION_H diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__pstl/configuration_fwd.h similarity index 93% rename from libcxx/include/__algorithm/pstl_backend.h rename to libcxx/include/__pstl/configuration_fwd.h index 3af03ce2fbc8e..995fcfce847cb 100644 --- a/libcxx/include/__algorithm/pstl_backend.h +++ b/libcxx/include/__pstl/configuration_fwd.h @@ -6,10 +6,9 @@ // //===----------------------------------------------------------------------===// -#ifndef _LIBCPP___ALGORITHM_PSTL_BACKEND_H -#define _LIBCPP___ALGORITHM_PSTL_BACKEND_H +#ifndef _LIBCPP___PSTL_CONFIGURATION_FWD_H +#define _LIBCPP___PSTL_CONFIGURATION_FWD_H -#include <__algorithm/pstl_backends/cpu_backend.h> #include <__config> #include @@ -191,6 +190,20 @@ into a program termination at the front-end level. When a backend returns a dise frontend will turn that into a call to `std::__throw_bad_alloc();` to report the internal failure to the user. 
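
Aside: the new `<__pstl/configuration.h>` above becomes the single place where a CPU backend is chosen, and the selection macros are renamed from `_LIBCPP_PSTL_CPU_BACKEND_*` to `_LIBCPP_PSTL_BACKEND_*` (likewise the CMake option, `LIBCXX_PSTL_CPU_BACKEND` to `LIBCXX_PSTL_BACKEND`, in the `CMakeLists.txt` hunk below). A condensed sketch of the dispatch chain, with the availability guards elided:

```cpp
// configuration.h: pick the backend header from the configuration macro.
#if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
#  include <__pstl/backends/serial.h>      // specializes __cpu_traits<__serial_backend_tag>
#elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD)
#  include <__pstl/backends/std_thread.h>  // specializes __cpu_traits<__std_thread_backend_tag>
#elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH)
#  include <__pstl/backends/libdispatch.h> // specializes __cpu_traits<__libdispatch_backend_tag>
#endif

// configuration_fwd.h: the tags themselves, plus the alias the PSTL
// frontend dispatches through (simplified from the hunk that follows).
namespace __pstl {
struct __serial_backend_tag {};
struct __std_thread_backend_tag {};
struct __libdispatch_backend_tag {};
} // namespace __pstl

#if defined(_LIBCPP_PSTL_BACKEND_SERIAL)
using __cpu_backend_tag = __pstl::__serial_backend_tag;
#endif // ...and likewise for the std_thread and libdispatch backends
```
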
*/ +namespace __pstl { +struct __libdispatch_backend_tag {}; +struct __serial_backend_tag {}; +struct __std_thread_backend_tag {}; +} // namespace __pstl + +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) +using __cpu_backend_tag = __pstl::__serial_backend_tag; +# elif defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) +using __cpu_backend_tag = __pstl::__std_thread_backend_tag; +# elif defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) +using __cpu_backend_tag = __pstl::__libdispatch_backend_tag; +# endif + template struct __select_backend; @@ -206,8 +219,8 @@ struct __select_backend { }; # endif -# if defined(_LIBCPP_PSTL_CPU_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_CPU_BACKEND_THREAD) || \ - defined(_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH) +# if defined(_LIBCPP_PSTL_BACKEND_SERIAL) || defined(_LIBCPP_PSTL_BACKEND_STD_THREAD) || \ + defined(_LIBCPP_PSTL_BACKEND_LIBDISPATCH) template <> struct __select_backend { using type = __cpu_backend_tag; @@ -229,4 +242,4 @@ _LIBCPP_END_NAMESPACE_STD #endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 -#endif // _LIBCPP___ALGORITHM_PSTL_BACKEND_H +#endif // _LIBCPP___PSTL_CONFIGURATION_FWD_H diff --git a/libcxx/include/__pstl/cpu_algos/cpu_traits.h b/libcxx/include/__pstl/cpu_algos/cpu_traits.h index 2f0db46e9be83..0483d6918fd01 100644 --- a/libcxx/include/__pstl/cpu_algos/cpu_traits.h +++ b/libcxx/include/__pstl/cpu_algos/cpu_traits.h @@ -32,31 +32,30 @@ namespace __pstl { // ================ // // template -// optional<__empty> __parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); +// optional<__empty> __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func); // - __func must take a subrange of [__first, __last) that should be executed in serial // // template -// optional<_Tp> __parallel_transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, -// _Reduction); +// optional<_Tp> __transform_reduce(_Iterator __first, _Iterator __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduction); // // template -// optional<_RandomAccessIterator3> __parallel_merge(_RandomAccessIterator1 __first1, -// _RandomAccessIterator1 __last1, -// _RandomAccessIterator2 __first2, -// _RandomAccessIterator2 __last2, -// _RandomAccessIterator3 __outit, -// _Compare __comp, -// _LeafMerge __leaf_merge); +// optional<_RandomAccessIterator3> __merge(_RandomAccessIterator1 __first1, +// _RandomAccessIterator1 __last1, +// _RandomAccessIterator2 __first2, +// _RandomAccessIterator2 __last2, +// _RandomAccessIterator3 __outit, +// _Compare __comp, +// _LeafMerge __leaf_merge); // // template -// optional<__empty> __parallel_stable_sort(_RandomAccessIterator __first, -// _RandomAccessIterator __last, -// _Comp __comp, -// _LeafSort __leaf_sort); +// optional<__empty> __stable_sort(_RandomAccessIterator __first, +// _RandomAccessIterator __last, +// _Comp __comp, +// _LeafSort __leaf_sort); // // void __cancel_execution(); // Cancel the execution of other jobs - they aren't needed anymore. 
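
To make the renamed customization points concrete, here is the minimal shape of a CPU backend after this patch, modeled on the serial backend earlier in the diff (the tag name `__my_backend_tag` is ours; `_LIBCPP_HIDE_FROM_ABI`, `__merge`, and `__cancel_execution` are omitted for brevity):

```cpp
namespace __pstl {
struct __my_backend_tag {};

template <>
struct __cpu_traits<__my_backend_tag> {
  // Formerly __parallel_for: run __f over the whole range as one serial chunk.
  template <class _RandomAccessIterator, class _Fp>
  static optional<__empty> __for_each(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) {
    __f(__first, __last);
    return __empty{};
  }

  // Formerly __parallel_transform_reduce: delegate to the serial leaf reduction.
  template <class _Index, class _UnaryOp, class _Tp, class _BinaryOp, class _Reduce>
  static optional<_Tp> __transform_reduce(_Index __first, _Index __last, _UnaryOp, _Tp __init, _BinaryOp, _Reduce __red) {
    return __red(std::move(__first), std::move(__last), std::move(__init));
  }

  // Formerly __parallel_stable_sort: sort the whole range with the leaf sorter.
  template <class _RandomAccessIterator, class _Comp, class _LeafSort>
  static optional<__empty> __stable_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf) {
    __leaf(__first, __last, __comp);
    return __empty{};
  }
};
} // namespace __pstl
```
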
This is not a binding request, diff --git a/libcxx/include/barrier b/libcxx/include/barrier index c5fd84b91925b..d776078267625 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -257,7 +257,7 @@ public: # endif // !_LIBCPP_HAS_NO_TREE_BARRIER template -class barrier { +class _LIBCPP_DEPRECATED_ATOMIC_SYNC barrier { __barrier_base<_CompletionF> __b_; public: diff --git a/libcxx/include/chrono b/libcxx/include/chrono index 513ae52006e89..f0275ec1eba7a 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -733,6 +733,10 @@ struct sys_info { string abbrev; }; +template // C++20 + basic_ostream& + operator<<(basic_ostream& os, const sys_info& si); + // 25.10.5, class time_zone // C++20 enum class choose {earliest, latest}; class time_zone { @@ -829,6 +833,7 @@ namespace std { template struct formatter; // C++20 template struct formatter>, charT>; // C++20 + template struct formatter; // C++20 } // namespace std namespace chrono { diff --git a/libcxx/include/iosfwd b/libcxx/include/iosfwd index 9af5e05031850..2481667dd972c 100644 --- a/libcxx/include/iosfwd +++ b/libcxx/include/iosfwd @@ -25,7 +25,6 @@ template<> struct char_traits; template class allocator; -class ios_base; template > class basic_ios; template > class basic_streambuf; @@ -124,8 +123,6 @@ using wosyncstream = basic_osyncstream; // C++20 _LIBCPP_BEGIN_NAMESPACE_STD -class _LIBCPP_EXPORTED_FROM_ABI ios_base; - template > class _LIBCPP_TEMPLATE_VIS istreambuf_iterator; template > diff --git a/libcxx/include/latch b/libcxx/include/latch index 3cc7258381143..1937617f7dcc6 100644 --- a/libcxx/include/latch +++ b/libcxx/include/latch @@ -66,7 +66,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -class latch { +class _LIBCPP_DEPRECATED_ATOMIC_SYNC latch { __atomic_base __a_; public: diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp index 8820fb8c0936f..a4e2690fc55c9 100644 --- a/libcxx/include/libcxx.imp +++ b/libcxx/include/libcxx.imp @@ -73,18 +73,12 @@ { include: [ "<__algorithm/pop_heap.h>", "private", "", "public" ] }, { include: [ "<__algorithm/prev_permutation.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_any_all_none_of.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backend.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backend.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/any_of.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/backend.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/fill.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/find_if.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/for_each.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/libdispatch.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/merge.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/serial.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/stable_sort.h>", "private", "", "public" ] }, - { include: [ "<__algorithm/pstl_backends/cpu_backends/thread.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/transform.h>", "private", "", "public" ] }, { include: [ "<__algorithm/pstl_backends/cpu_backends/transform_reduce.h>", "private", "", "public" 
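
Context for the `_LIBCPP_DEPRECATED_ATOMIC_SYNC` annotations on `barrier` and `latch` above (a sketch; exactly when libc++ defines this macro to `[[deprecated]]` is configuration-dependent, and the diagnostic text is compiler-specific):

```cpp
#include <latch>
#include <semaphore>

std::latch gate(1);                 // may warn: 'latch' is deprecated
std::counting_semaphore<4> sem(4);  // may warn: 'counting_semaphore<4>' is deprecated
std::binary_semaphore flag(1);      // may warn: 'binary_semaphore' is deprecated
```

Note the `_LIBCPP_SUPPRESS_DEPRECATED_PUSH`/`_LIBCPP_SUPPRESS_DEPRECATED_POP` pair around the `binary_semaphore` alias in the `<semaphore>` hunk below: it keeps the header itself from warning when it names the now-deprecated `counting_semaphore<1>`, while user code naming `binary_semaphore` is still diagnosed via the attribute on the alias.
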
] }, { include: [ "<__algorithm/pstl_copy.h>", "private", "", "public" ] }, diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index ce133e471deb7..f996c2cc05459 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -714,32 +714,14 @@ module std_private_algorithm_partition_point [system module std_private_algorithm_pop_heap [system] { header "__algorithm/pop_heap.h" } module std_private_algorithm_prev_permutation [system] { header "__algorithm/prev_permutation.h" } module std_private_algorithm_pstl_any_all_none_of [system] { header "__algorithm/pstl_any_all_none_of.h" } -module std_private_algorithm_pstl_backend [system] { - header "__algorithm/pstl_backend.h" - export * -} -module std_private_algorithm_pstl_backends_cpu_backend [system] { - header "__algorithm/pstl_backends/cpu_backend.h" - export * -} -module std_private_algorithm_pstl_backends_cpu_backends_any_of [system] { header "__algorithm/pstl_backends/cpu_backends/any_of.h" } -module std_private_algorithm_pstl_backends_cpu_backends_backend [system] { - header "__algorithm/pstl_backends/cpu_backends/backend.h" - export * -} -module std_private_algorithm_pstl_backends_cpu_backends_fill [system] { header "__algorithm/pstl_backends/cpu_backends/fill.h" } -module std_private_algorithm_pstl_backends_cpu_backends_find_if [system] { header "__algorithm/pstl_backends/cpu_backends/find_if.h" } -module std_private_algorithm_pstl_backends_cpu_backends_for_each [system] { header "__algorithm/pstl_backends/cpu_backends/for_each.h" } -module std_private_algorithm_pstl_backends_cpu_backends_libdispatch [system] { header "__algorithm/pstl_backends/cpu_backends/libdispatch.h" } -module std_private_algorithm_pstl_backends_cpu_backends_merge [system] { header "__algorithm/pstl_backends/cpu_backends/merge.h" } -module std_private_algorithm_pstl_backends_cpu_backends_serial [system] { textual header "__algorithm/pstl_backends/cpu_backends/serial.h" } -module std_private_algorithm_pstl_backends_cpu_backends_stable_sort [system] { header "__algorithm/pstl_backends/cpu_backends/stable_sort.h" } -module std_private_algorithm_pstl_backends_cpu_backends_thread [system] { textual header "__algorithm/pstl_backends/cpu_backends/thread.h" } -module std_private_algorithm_pstl_backends_cpu_backends_transform [system] { - header "__algorithm/pstl_backends/cpu_backends/transform.h" - export std_private_algorithm_transform -} -module std_private_algorithm_pstl_backends_cpu_backends_transform_reduce [system] { header "__algorithm/pstl_backends/cpu_backends/transform_reduce.h" } +module std_private_algorithm_pstl_backends_cpu_backends_any_of [system] { textual header "__algorithm/pstl_backends/cpu_backends/any_of.h" } +module std_private_algorithm_pstl_backends_cpu_backends_fill [system] { textual header "__algorithm/pstl_backends/cpu_backends/fill.h" } +module std_private_algorithm_pstl_backends_cpu_backends_find_if [system] { textual header "__algorithm/pstl_backends/cpu_backends/find_if.h" } +module std_private_algorithm_pstl_backends_cpu_backends_for_each [system] { textual header "__algorithm/pstl_backends/cpu_backends/for_each.h" } +module std_private_algorithm_pstl_backends_cpu_backends_merge [system] { textual header "__algorithm/pstl_backends/cpu_backends/merge.h" } +module std_private_algorithm_pstl_backends_cpu_backends_stable_sort [system] { textual header "__algorithm/pstl_backends/cpu_backends/stable_sort.h" } +module std_private_algorithm_pstl_backends_cpu_backends_transform [system] { textual 
header "__algorithm/pstl_backends/cpu_backends/transform.h" } +module std_private_algorithm_pstl_backends_cpu_backends_transform_reduce [system] { textual header "__algorithm/pstl_backends/cpu_backends/transform_reduce.h" } module std_private_algorithm_pstl_copy [system] { header "__algorithm/pstl_copy.h" } module std_private_algorithm_pstl_count [system] { header "__algorithm/pstl_count.h" } module std_private_algorithm_pstl_equal [system] { header "__algorithm/pstl_equal.h" } @@ -1613,7 +1595,18 @@ module std_private_numeric_transform_exclusive_scan [system] { header "__numeric module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } -module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } +module std_private_pstl_backends_libdispatch [system] { header "__pstl/backends/libdispatch.h" } +module std_private_pstl_backends_serial [system] { header "__pstl/backends/serial.h" } +module std_private_pstl_backends_std_thread [system] { header "__pstl/backends/std_thread.h" } +module std_private_pstl_cpu_algos_cpu_traits [system] { header "__pstl/cpu_algos/cpu_traits.h" } +module std_private_pstl_configuration_fwd [system] { + header "__pstl/configuration_fwd.h" + export * +} +module std_private_pstl_configuration [system] { + header "__pstl/configuration.h" + export * +} module std_private_queue_fwd [system] { header "__fwd/queue.h" } diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore index 1375ec3f7c04b..cb2f42c106ca8 100644 --- a/libcxx/include/semaphore +++ b/libcxx/include/semaphore @@ -127,7 +127,7 @@ private: }; template -class counting_semaphore { +class _LIBCPP_DEPRECATED_ATOMIC_SYNC counting_semaphore { __atomic_semaphore_base __semaphore_; public: @@ -172,7 +172,9 @@ public: } }; -using binary_semaphore = counting_semaphore<1>; +_LIBCPP_SUPPRESS_DEPRECATED_PUSH +using binary_semaphore _LIBCPP_DEPRECATED_ATOMIC_SYNC = counting_semaphore<1>; +_LIBCPP_SUPPRESS_DEPRECATED_POP _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index 46353986f5d7d..64cf368e6e684 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -575,6 +575,7 @@ {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -2073,6 +2074,7 @@ {'is_defined': True, 'name': '__ZTINSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 
'name': '__ZTINSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTINSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} @@ -2264,6 +2266,7 @@ {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb0EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTSNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__13pmr15memory_resourceE', 'size': 0, 'type': 'OBJECT'} @@ -2482,6 +2485,7 @@ {'is_defined': True, 'name': '__ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTVNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index fec3a4505a0c6..8751dffe23025 100644 --- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -209,6 +209,7 @@ {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt6__ndk119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE10do_unshiftER9mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1722,6 +1723,7 @@ {'is_defined': True, 'name': '_ZTINSt6__ndk118__time_get_storageIwEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'_ZTINSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119__shared_weak_countE', 'size': 24, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt6__ndk119bad_expected_accessIvEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 12, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 12, 'type': 'OBJECT'} @@ -1958,6 +1960,7 @@ {'is_defined': True, 'name': '_ZTSNSt6__ndk118__time_get_storageIwEE', 'size': 35, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 72, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119__shared_weak_countE', 'size': 33, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt6__ndk119bad_expected_accessIvEE', 'size': 36, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 38, 'type': 'OBJECT'} @@ -2188,6 +2191,7 @@ {'is_defined': True, 'name': '_ZTVNSt6__ndk117moneypunct_bynameIwLb1EEE', 'size': 56, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 60, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119__shared_weak_countE', 'size': 28, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt6__ndk119bad_expected_accessIvEE', 'size': 20, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 48, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index e52cf98dd4c4f..7e223e6652884 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -99,6 +99,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftERPcS2_S2_S3_', 'storage_mapping_class': 'DS', 
'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} @@ -910,6 +911,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} @@ -969,6 +971,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117bad_function_callE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} @@ -1031,6 +1034,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 
52a04706ddf20..407d0456757af 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -99,6 +99,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftERPcS2_S2_S3_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} @@ -910,6 +911,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} @@ -969,6 +971,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__117bad_function_callE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RO', 'type': 'OBJECT'} @@ -1031,6 +1034,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__117__widen_from_utf8ILm32EEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': 
True, 'name': '_ZTVNSt3__117bad_function_callE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDsEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IwEE', 'storage_mapping_class': 'RW', 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index c169b4a992521..d578b41383c0e 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -575,6 +575,7 @@ {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '__ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -2087,6 +2088,7 @@ {'is_defined': True, 'name': '__ZTINSt3__118__time_get_storageIwEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTINSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} @@ -2291,6 +2293,7 @@ {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb0EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTSNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'__ZTSNSt3__13pmr15memory_resourceE', 'size': 0, 'type': 'OBJECT'} @@ -2516,6 +2519,7 @@ {'is_defined': True, 'name': '__ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119__shared_weak_countE', 'size': 0, 'type': 'OBJECT'} +{'is_defined': True, 'name': '__ZTVNSt3__119bad_expected_accessIvEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 0, 'type': 'OBJECT'} {'is_defined': True, 'name': '__ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 0, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index efa2189e9c928..fc0f4fcf415e6 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -209,6 +209,7 @@ {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt6__ndk119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE10do_unshiftER9mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt6__ndk120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1722,6 +1723,7 @@ {'is_defined': True, 'name': '_ZTINSt6__ndk118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt6__ndk119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1955,6 +1957,7 @@ {'is_defined': True, 'name': '_ZTSNSt6__ndk118__time_get_storageIwEE', 'size': 35, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 72, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119__shared_weak_countE', 'size': 33, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt6__ndk119bad_expected_accessIvEE', 'size': 36, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'_ZTSNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 73, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 38, 'type': 'OBJECT'} @@ -2182,6 +2185,7 @@ {'is_defined': True, 'name': '_ZTVNSt6__ndk117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt6__ndk119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt6__ndk120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist index ebda5b0dfba57..4022339562b3a 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -266,6 +266,7 @@ {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1695,6 +1696,7 @@ {'is_defined': True, 'name': '_ZTINSt3__118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1829,6 +1831,7 @@ {'is_defined': True, 'name': '_ZTSNSt3__118__time_get_storageIwEE', 'size': 32, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 69, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'size': 30, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'size': 33, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'_ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 35, 'type': 'OBJECT'} @@ -1962,6 +1965,7 @@ {'is_defined': True, 'name': '_ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 6432ad3be3585..574c4504c59b8 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -264,6 +264,7 @@ {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1696,6 +1697,7 @@ {'is_defined': True, 'name': '_ZTINSt3__118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1830,6 +1832,7 @@ {'is_defined': True, 'name': '_ZTSNSt3__118__time_get_storageIwEE', 'size': 32, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 69, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119__shared_weak_countE', 'size': 30, 'type': 'OBJECT'} +{'is_defined': True, 'name': 
'_ZTSNSt3__119bad_expected_accessIvEE', 'size': 33, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 35, 'type': 'OBJECT'} @@ -1963,6 +1966,7 @@ {'is_defined': True, 'name': '_ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 1fe84e17b3f7f..665546699e8de 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -235,6 +235,7 @@ {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIcE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__118__time_get_storageIwE15__do_date_orderEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__119__shared_weak_count13__get_deleterERKSt9type_info', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNKSt3__119bad_expected_accessIvE4whatEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE10do_unshiftER11__mbstate_tPcS4_RS4_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE11do_encodingEv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNKSt3__120__codecvt_utf8_utf16IDiE13do_max_lengthEv', 'type': 'FUNC'} @@ -1667,6 +1668,7 @@ {'is_defined': True, 'name': '_ZTINSt3__118__time_get_storageIwEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119__shared_weak_countE', 'size': 40, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTINSt3__119bad_expected_accessIvEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 24, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTINSt3__120__codecvt_utf8_utf16IDiEE', 'size': 24, 'type': 'OBJECT'} @@ -1801,6 +1803,7 @@ {'is_defined': True, 'name': '_ZTSNSt3__118__time_get_storageIwEE', 'size': 32, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 69, 'type': 'OBJECT'} {'is_defined': True, 'name': 
'_ZTSNSt3__119__shared_weak_countE', 'size': 30, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTSNSt3__119bad_expected_accessIvEE', 'size': 33, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 70, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTSNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 35, 'type': 'OBJECT'} @@ -1934,6 +1937,7 @@ {'is_defined': True, 'name': '_ZTVNSt3__117moneypunct_bynameIwLb1EEE', 'size': 112, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__118basic_stringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 120, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119__shared_weak_countE', 'size': 56, 'type': 'OBJECT'} +{'is_defined': True, 'name': '_ZTVNSt3__119bad_expected_accessIvEE', 'size': 40, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_istringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE', 'size': 80, 'type': 'OBJECT'} {'is_defined': True, 'name': '_ZTVNSt3__120__codecvt_utf8_utf16IDiEE', 'size': 96, 'type': 'OBJECT'} diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 208500ec14fcd..8b28d1b891895 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -10,6 +10,7 @@ set(LIBCXX_SOURCES chrono.cpp error_category.cpp exception.cpp + expected.cpp filesystem/filesystem_clock.cpp filesystem/filesystem_error.cpp filesystem/path_parser.h @@ -326,7 +327,7 @@ set(LIBCXX_EXPERIMENTAL_SOURCES experimental/keep.cpp ) -if (LIBCXX_PSTL_CPU_BACKEND STREQUAL "libdispatch") +if (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch") list(APPEND LIBCXX_EXPERIMENTAL_SOURCES pstl/libdispatch.cpp ) diff --git a/libcxx/src/expected.cpp b/libcxx/src/expected.cpp new file mode 100644 index 0000000000000..f30efb5164796 --- /dev/null +++ b/libcxx/src/expected.cpp @@ -0,0 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <expected> + +_LIBCPP_BEGIN_NAMESPACE_STD +const char* bad_expected_access<void>::what() const noexcept { return "bad access to std::expected"; } +_LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/src/functional.cpp b/libcxx/src/functional.cpp index 570bb78e150b7..ef53e3e84da0e 100644 --- a/libcxx/src/functional.cpp +++ b/libcxx/src/functional.cpp @@ -10,9 +10,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#ifdef _LIBCPP_ABI_BAD_FUNCTION_CALL_KEY_FUNCTION bad_function_call::~bad_function_call() noexcept {} -#endif #ifdef _LIBCPP_ABI_BAD_FUNCTION_CALL_GOOD_WHAT_MESSAGE const char* bad_function_call::what() const noexcept { return "std::bad_function_call"; } diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp index d997a9c73463d..3dca702341c85 100644 --- a/libcxx/src/pstl/libdispatch.cpp +++ b/libcxx/src/pstl/libdispatch.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include <__algorithm/min.h> -#include <__algorithm/pstl_backends/cpu_backends/libdispatch.h> #include <__config> +#include <__pstl/backends/libdispatch.h> #include <dispatch/dispatch.h> _LIBCPP_BEGIN_NAMESPACE_STD diff --git a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp index 344543d5f19ff..544a9744b7909 100644 --- a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp +++ b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp @@ -16,29 +16,29 @@ #include struct missing_deref { - using difference_type = std::ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_category = std::input_iterator_tag; - using value_type = int; - using reference = int&; + using value_type = int; + using reference = int&; missing_deref& operator++(); }; struct missing_preincrement { - using difference_type = std::ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_category = std::input_iterator_tag; - using value_type = int; - using reference = int&; + using value_type = int; + using reference = int&; int& operator*(); }; template struct valid_iterator { - using difference_type = std::ptrdiff_t; + using difference_type = std::ptrdiff_t; using iterator_category = std::input_iterator_tag; - using value_type = int; - using reference = int&; + using value_type = int; + using reference = int&; int& operator*() const; Derived& operator++(); @@ -51,30 +51,30 @@ struct valid_iterator { }; struct not_move_constructible : valid_iterator { - not_move_constructible(const not_move_constructible&) = default; - not_move_constructible(not_move_constructible&&) = delete; - not_move_constructible& operator=(not_move_constructible&&) = default; + not_move_constructible(const not_move_constructible&) = default; + not_move_constructible(not_move_constructible&&) = delete; + not_move_constructible& operator=(not_move_constructible&&) = default; not_move_constructible& operator=(const not_move_constructible&) = default; }; struct not_copy_constructible : valid_iterator { - not_copy_constructible(const not_copy_constructible&) = delete; - not_copy_constructible(not_copy_constructible&&) = default; - not_copy_constructible& operator=(not_copy_constructible&&) = default; + not_copy_constructible(const not_copy_constructible&) = delete; + not_copy_constructible(not_copy_constructible&&) = default; + not_copy_constructible& operator=(not_copy_constructible&&)
= default; not_copy_constructible& operator=(const not_copy_constructible&) = default; }; struct not_move_assignable : valid_iterator { - not_move_assignable(const not_move_assignable&) = default; - not_move_assignable(not_move_assignable&&) = default; - not_move_assignable& operator=(not_move_assignable&&) = delete; + not_move_assignable(const not_move_assignable&) = default; + not_move_assignable(not_move_assignable&&) = default; + not_move_assignable& operator=(not_move_assignable&&) = delete; not_move_assignable& operator=(const not_move_assignable&) = default; }; struct not_copy_assignable : valid_iterator { - not_copy_assignable(const not_copy_assignable&) = default; - not_copy_assignable(not_copy_assignable&&) = default; - not_copy_assignable& operator=(not_copy_assignable&&) = default; + not_copy_assignable(const not_copy_assignable&) = default; + not_copy_assignable(not_copy_assignable&&) = default; + not_copy_assignable& operator=(not_copy_assignable&&) = default; not_copy_assignable& operator=(const not_copy_assignable&) = delete; }; @@ -89,7 +89,6 @@ void check_iterator_requirements() { static_assert(std::__cpp17_iterator); // expected-error {{static assertion failed}} // expected-note@*:* {{cannot increment value of type 'missing_preincrement'}} - static_assert(std::__cpp17_iterator); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'not_move_constructible' does not satisfy '__cpp17_move_constructible'}} @@ -115,11 +114,13 @@ bool operator==(not_unequality_comparable, not_unequality_comparable); bool operator!=(not_unequality_comparable, not_unequality_comparable) = delete; void check_input_iterator_requirements() { - _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_equality_comparable); // expected-error {{static assertion failed}} + // clang-format off + _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_equality_comparable, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{'__lhs == __rhs' would be invalid: overload resolution selected deleted operator '=='}} - _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_unequality_comparable); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_unequality_comparable, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{'__lhs != __rhs' would be invalid: overload resolution selected deleted operator '!='}} + // clang-format on } template @@ -138,9 +139,9 @@ struct postincrement_not_ref : valid_iterator {}; bool operator==(postincrement_not_ref, postincrement_not_ref); void check_forward_iterator_requirements() { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(not_default_constructible); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(not_default_constructible, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'not_default_constructible' does not satisfy '__cpp17_default_constructible'}} - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref, ""); // expected-error {{static assertion failed}} #ifndef _AIX // expected-note@*:* {{because type constraint 'convertible_to::Proxy, const postincrement_not_ref &>' was not satisfied}} #endif @@ -155,7 +156,6 @@ struct missing_postdecrement : valid_forward_iterator { }; struct not_returning_iter_reference : valid_forward_iterator { - struct Proxy { operator const not_returning_iter_reference&(); @@ -167,12 +167,14 @@ struct 
not_returning_iter_reference : valid_forward_iterator >' was not satisfied}} + // clang-format on } template @@ -246,7 +248,8 @@ struct missing_minus_const_iter_const_iter : valid_random_access_iterator { @@ -359,62 +362,64 @@ struct missing_const_const_greater_eq : valid_random_access_iterator __iter' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) > __iter' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter > std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) > std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter <= __iter' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) <= __iter' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter <= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_less_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_less_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) <= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_greater_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter >= __iter' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater_eq); // expected-error {{static assertion failed}} + 
_LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) >= __iter' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because '__iter >= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater_eq); // expected-error {{static assertion failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater_eq, ""); // expected-error {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) >= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>='}} + // clang-format on } diff --git a/libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp b/libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp new file mode 100644 index 0000000000000..98e3509752e16 --- /dev/null +++ b/libcxx/test/libcxx/algorithms/pstl.iterator-requirements.verify.cpp @@ -0,0 +1,192 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 +// REQUIRES: stdlib=libc++ +// UNSUPPORTED: libcpp-has-no-incomplete-pstl + +// +// + +// Make sure that all PSTL algorithms contain checks for iterator requirements. +// This is not a requirement from the Standard, but we strive to catch misuse in +// the PSTL both because we can, and because iterator category mistakes in the +// PSTL can lead to subtle bugs. + +// Ignore spurious errors after the initial static_assert failure. +// ADDITIONAL_COMPILE_FLAGS: -Xclang -verify-ignore-unexpected=error + +// We only diagnose this in C++20 and above because we implement the checks with concepts. +// UNSUPPORTED: c++17 + +#include +#include +#include + +#include "test_iterators.h" + +using non_forward_iterator = cpp17_input_iterator; +struct non_output_iterator : forward_iterator { + constexpr int const& operator*() const; // prevent it from being an output iterator +}; + +void f(non_forward_iterator non_fwd, non_output_iterator non_output, std::execution::sequenced_policy pol) { + auto pred = [](auto&&...) -> bool { return true; }; + auto func = [](auto&&...) 
-> int { return 1; }; + int* it = nullptr; + int* out = nullptr; + std::size_t n = 0; + int val = 0; + + { + (void)std::any_of(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: any_of}} + (void)std::all_of(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: all_of}} + (void)std::none_of(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: none_of}} + } + + { + (void)std::copy(pol, non_fwd, non_fwd, it); // expected-error@*:* {{static assertion failed: copy}} + (void)std::copy(pol, it, it, non_fwd); // expected-error@*:* {{static assertion failed: copy}} + (void)std::copy(pol, it, it, non_output); // expected-error@*:* {{static assertion failed: copy}} + } + { + (void)std::copy_n(pol, non_fwd, n, it); // expected-error@*:* {{static assertion failed: copy_n}} + (void)std::copy_n(pol, it, n, non_fwd); // expected-error@*:* {{static assertion failed: copy_n}} + (void)std::copy_n(pol, it, n, non_output); // expected-error@*:* {{static assertion failed: copy_n}} + } + + { + (void)std::count(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: count}} + (void)std::count_if(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: count_if}} + } + + { + (void)std::equal(pol, non_fwd, non_fwd, it); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, non_fwd, non_fwd, it, pred); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd, pred); // expected-error@*:* {{static assertion failed: equal}} + + (void)std::equal(pol, non_fwd, non_fwd, it, it); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd, non_fwd); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, non_fwd, non_fwd, it, it, pred); // expected-error@*:* {{static assertion failed: equal}} + (void)std::equal(pol, it, it, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: equal}} + } + + { + (void)std::fill(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: fill}} + (void)std::fill_n(pol, non_fwd, n, val); // expected-error@*:* {{static assertion failed: fill_n}} + } + + { + (void)std::find(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: find}} + (void)std::find_if(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: find_if}} + (void)std::find_if_not(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: find_if_not}} + } + + { + (void)std::for_each(pol, non_fwd, non_fwd, func); // expected-error@*:* {{static assertion failed: for_each}} + (void)std::for_each_n(pol, non_fwd, n, func); // expected-error@*:* {{static assertion failed: for_each_n}} + } + + { + (void)std::generate(pol, non_fwd, non_fwd, func); // expected-error@*:* {{static assertion failed: generate}} + (void)std::generate_n(pol, non_fwd, n, func); // expected-error@*:* {{static assertion failed: generate_n}} + } + + { + (void)std::is_partitioned( + pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: is_partitioned}} + } + + { + (void)std::merge(pol, non_fwd, non_fwd, it, it, out); // expected-error@*:* {{static assertion failed: merge}} + (void)std::merge(pol, it, it, non_fwd, non_fwd, out); // expected-error@*:* {{static assertion failed: merge}} + 
(void)std::merge(pol, it, it, it, it, non_output); // expected-error@*:* {{static assertion failed: merge}} + + (void)std::merge(pol, non_fwd, non_fwd, it, it, out, pred); // expected-error@*:* {{static assertion failed: merge}} + (void)std::merge(pol, it, it, non_fwd, non_fwd, out, pred); // expected-error@*:* {{static assertion failed: merge}} + (void)std::merge(pol, it, it, it, it, non_output, pred); // expected-error@*:* {{static assertion failed: merge}} + } + + { + (void)std::move(pol, non_fwd, non_fwd, out); // expected-error@*:* {{static assertion failed: move}} + (void)std::move(pol, it, it, non_fwd); // expected-error@*:* {{static assertion failed: move}} + (void)std::move(pol, it, it, non_output); // expected-error@*:* {{static assertion failed: move}} + } + + { + (void)std::replace_if( + pol, non_fwd, non_fwd, pred, val); // expected-error@*:* {{static assertion failed: replace_if}} + (void)std::replace(pol, non_fwd, non_fwd, val, val); // expected-error@*:* {{static assertion failed: replace}} + + (void)std::replace_copy_if( + pol, non_fwd, non_fwd, out, pred, val); // expected-error@*:* {{static assertion failed: replace_copy_if}} + (void)std::replace_copy_if( + pol, it, it, non_fwd, pred, val); // expected-error@*:* {{static assertion failed: replace_copy_if}} + (void)std::replace_copy_if( + pol, it, it, non_output, pred, val); // expected-error@*:* {{static assertion failed: replace_copy_if}} + + (void)std::replace_copy( + pol, non_fwd, non_fwd, out, val, val); // expected-error@*:* {{static assertion failed: replace_copy}} + (void)std::replace_copy( + pol, it, it, non_fwd, val, val); // expected-error@*:* {{static assertion failed: replace_copy}} + (void)std::replace_copy( + pol, it, it, non_output, val, val); // expected-error@*:* {{static assertion failed: replace_copy}} + } + + { + (void)std::rotate_copy( + pol, non_fwd, non_fwd, non_fwd, out); // expected-error@*:* {{static assertion failed: rotate_copy}} + (void)std::rotate_copy(pol, it, it, it, non_fwd); // expected-error@*:* {{static assertion failed: rotate_copy}} + (void)std::rotate_copy(pol, it, it, it, non_output); // expected-error@*:* {{static assertion failed: rotate_copy}} + } + + { + (void)std::sort(pol, non_fwd, non_fwd); // expected-error@*:* {{static assertion failed: sort}} + (void)std::sort(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: sort}} + } + + { + (void)std::stable_sort(pol, non_fwd, non_fwd); // expected-error@*:* {{static assertion failed: stable_sort}} + (void)std::stable_sort(pol, non_fwd, non_fwd, pred); // expected-error@*:* {{static assertion failed: stable_sort}} + } + + { + (void)std::transform(pol, non_fwd, non_fwd, out, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, non_fwd, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, non_output, func); // expected-error@*:* {{static assertion failed: transform}} + + (void)std::transform( + pol, non_fwd, non_fwd, it, out, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, non_fwd, out, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform(pol, it, it, it, non_fwd, func); // expected-error@*:* {{static assertion failed: transform}} + (void)std::transform( + pol, it, it, it, non_output, func); // expected-error@*:* {{static assertion failed: transform}} + } + + { + (void)std::reduce(pol, non_fwd, non_fwd); // 
expected-error@*:* {{static assertion failed: reduce}} + (void)std::reduce(pol, non_fwd, non_fwd, val); // expected-error@*:* {{static assertion failed: reduce}} + (void)std::reduce(pol, non_fwd, non_fwd, val, func); // expected-error@*:* {{static assertion failed: reduce}} + } + + { + (void)std::transform_reduce( + pol, non_fwd, non_fwd, it, val); // expected-error@*:* {{static assertion failed: transform_reduce}} + (void)std::transform_reduce( + pol, it, it, non_fwd, val); // expected-error@*:* {{static assertion failed: transform_reduce}} + + (void)std::transform_reduce( + pol, non_fwd, non_fwd, it, val, func, func); // expected-error@*:* {{static assertion failed: transform_reduce}} + (void)std::transform_reduce( + pol, it, it, non_fwd, val, func, func); // expected-error@*:* {{static assertion failed: transform_reduce}} + + (void)std::transform_reduce( + pol, non_fwd, non_fwd, val, func, func); // expected-error@*:* {{static assertion failed: transform_reduce}} + } +} diff --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp index 8c7016a80b811..b48ac02dd79c5 100644 --- a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp +++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp @@ -8,11 +8,11 @@ // -// REQUIRES: libcpp-pstl-cpu-backend-libdispatch +// REQUIRES: libcpp-pstl-backend-libdispatch // __chunk_partitions __partition_chunks(ptrdiff_t); -#include <__algorithm/pstl_backends/cpu_backends/libdispatch.h> +#include <__pstl/backends/libdispatch.h> #include #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp index 588ce2a3d17ed..614323b1ffd7b 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/asan.pass.cpp @@ -29,8 +29,7 @@ void do_exit() { int main(int, char**) { -#if TEST_STD_VER >= 11 && TEST_CLANG_VER >= 1600 - // TODO(LLVM-18): Remove the special-casing +#if TEST_STD_VER >= 11 { typedef int T; typedef cpp17_input_iterator MyInputIter; @@ -52,7 +51,7 @@ int main(int, char**) assert(v[1] == 'b'); assert(is_contiguous_container_asan_correct(v)); } -#endif +#endif // TEST_STD_VER >= 11 { typedef cpp17_input_iterator MyInputIter; // Should not trigger ASan. diff --git a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp b/libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp deleted file mode 100644 index 1b7acad3cfa46..0000000000000 --- a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.compile.pass.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Test that defining _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES correctly defines -// _LIBCPP_ENABLE_CXX17_REMOVED_FOO for each individual component macro.
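For context, the deleted test above exercised libc++'s umbrella-macro pattern: defining one opt-in macro fans out to the per-component macros it controls. A minimal paraphrased sketch of that pattern, using only the component names the test checks (the real wiring lives in <__config> and is not quoted here):

#if defined(_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES)
#  define _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR
#  define _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS
#  define _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE
#  define _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS
#endif

With the umbrella macro deprecated and its tests removed, user code is expected to define only the individual _LIBCPP_ENABLE_CXX17_REMOVED_* macros it still needs.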
- -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES -Wno-deprecated-pragma - -#include <__config> - -#include "test_macros.h" - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR -# error _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS -# error _LIBCPP_ENABLE_CXX17_REMOVED_BINDERS must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE -# error _LIBCPP_ENABLE_CXX17_REMOVED_RANDOM_SHUFFLE must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS -#error _LIBCPP_ENABLE_CXX17_REMOVED_UNEXPECTED_FUNCTIONS must be defined -#endif - -#ifndef _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR -#error _LIBCPP_ENABLE_CXX17_REMOVED_AUTO_PTR must be defined -#endif diff --git a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp b/libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp deleted file mode 100644 index 059c1b3ead4f1..0000000000000 --- a/libcxx/test/libcxx/depr/enable_removed_cpp17_features.deprecated.verify.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// <__config> - -// Ensure that defining _LIBCPP_ENABLE_CXX17_REMOVED_FEATURES yields a -// deprecation warning. We intend to issue a deprecation warning in LLVM 18 -// and remove the macro entirely in LLVM 19. As such, this test will be quite -// short lived. - -// UNSUPPORTED: clang-modules-build - -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES - -#include <__config> // expected-warning@* 1+ {{macro '_LIBCPP_ENABLE_CXX17_REMOVED_FEATURES' has been marked as deprecated}} diff --git a/libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp b/libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp deleted file mode 100644 index 163ff7d8fbda0..0000000000000 --- a/libcxx/test/libcxx/depr/enable_removed_cpp20_features.deprecated.verify.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// <__config> - -// Ensure that defining _LIBCPP_ENABLE_CXX20_REMOVED_FEATURES yields a -// deprecation warning. We intend to issue a deprecation warning in LLVM 18 -// and remove the macro entirely in LLVM 19. As such, this test will be quite -// short lived. 
- -// UNSUPPORTED: clang-modules-build - -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES - -#include // expected-warning@* 1+ {{macro '_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES' has been marked as deprecated}} diff --git a/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp new file mode 100644 index 0000000000000..faa7d855c8de7 --- /dev/null +++ b/libcxx/test/libcxx/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-localization + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-incomplete-tzdb + +// + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const sys_info& r); + +// [time.zone.info.sys] +// 7 Effects: Streams out the sys_info object r in an unspecified format. +// 8 Returns: os. +// +// Tests the output produced by this function. + +#include +#include +#include +#include + +#include "assert_macros.h" +#include "test_macros.h" +#include "make_string.h" +#include "concat_macros.h" + +#define SV(S) MAKE_STRING_VIEW(CharT, S) + +template +static void test(std::basic_string_view expected, std::chrono::sys_info&& info) { + std::basic_stringstream sstr; + sstr << info; + std::basic_string output = sstr.str(); + + TEST_REQUIRE(expected == output, + TEST_WRITE_CONCATENATED("\nExpected output ", expected, "\nActual output ", output, '\n')); +} + +template +static void test() { + using namespace std::literals::chrono_literals; + namespace tz = std::chrono; + + test(SV("[-10484-10-16 15:30:08, 14423-03-17 15:30:07) 00:00:00 0min \"TZ\""), + tz::sys_info{tz::sys_seconds::min(), tz::sys_seconds::max(), 0s, 0min, "TZ"}); + + test(SV("[1970-01-01 00:00:00, 2038-12-31 00:00:00) 12:23:45 -67min \"DMY\""), + tz::sys_info{static_cast(tz::year_month_day{1970y, tz::January, 1d}), + static_cast(tz::year_month_day{2038y, tz::December, 31d}), + 12h + 23min + 45s, + -67min, + "DMY"}); +} + +int main(int, const char**) { + test(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp index 3e2e080368f4c..4ea27401e35d4 100644 --- a/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp +++ b/libcxx/test/libcxx/vendor/apple/system-install-properties.sh.cpp @@ -45,4 +45,4 @@ // Make sure we use the libdispatch backend for the PSTL. 
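For illustration (not part of the patch): with the renamed _LIBCPP_PSTL_BACKEND_LIBDISPATCH macro present in <__config_site>, libc++'s parallel algorithms run on top of libdispatch. A hedged usage sketch, assuming a build where the (still incomplete) PSTL is enabled:

#include <algorithm>
#include <execution>
#include <vector>

int main() {
  std::vector<int> v(1'000'000, 1);
  // When _LIBCPP_PSTL_BACKEND_LIBDISPATCH is defined in <__config_site>,
  // this parallel sort is dispatched to libdispatch's thread pool
  // instead of running serially.
  std::sort(std::execution::par, v.begin(), v.end());
  return 0;
}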
// -// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH' +// RUN: grep "%{include-dir}/__config_site" -e '#define _LIBCPP_PSTL_BACKEND_LIBDISPATCH' diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp index 2b9f34b731f87..0ec530c922e70 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_all.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp index dfa781c566009..c21b67d479ae2 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_notify_one.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp index 38142b336e72c..af99113f13499 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp index 2db95a0b67a7f..bb8c64593b54b 100644 --- a/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.operations/atomics.types.operations.wait/atomic_wait_explicit.pass.cpp @@ -7,9 +7,12 @@ //===----------------------------------------------------------------------===// // // UNSUPPORTED: no-threads -// XFAIL: c++03 +// UNSUPPORTED: c++03 // XFAIL: !has-1024-bit-atomics 
+// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp index 18cdc6d654ac2..d9d9c1dba6bbb 100644 --- a/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/arrive.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp index 3fc48261de1b1..aff7b26e16f70 100644 --- a/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/arrive_and_drop.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp index 2aee8624ae3d5..8c45ba9278f28 100644 --- a/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/arrive_and_wait.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp index 7354dbe6ffe8a..633a0c8bf2366 100644 --- a/libcxx/test/std/thread/thread.barrier/completion.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/completion.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp index d47127a18613b..fe7068d2a574c 100644 --- a/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/ctor.compile.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // explicit barrier(ptrdiff_t __count, _CompletionF __completion = _CompletionF()); diff --git a/libcxx/test/std/thread/thread.barrier/max.pass.cpp b/libcxx/test/std/thread/thread.barrier/max.pass.cpp index ec03c5c87a09c..b09a02e1bdef4 100644 --- a/libcxx/test/std/thread/thread.barrier/max.pass.cpp +++ b/libcxx/test/std/thread/thread.barrier/max.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in 
C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // #include diff --git a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp index ddc06d2038cc8..8ca4f37b73b95 100644 --- a/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/arrive_and_wait.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp index 1503c09509a6c..eb524abd24b98 100644 --- a/libcxx/test/std/thread/thread.latch/count_down.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/count_down.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp index 1983f6409cb5a..bca4561bd2f74 100644 --- a/libcxx/test/std/thread/thread.latch/ctor.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/ctor.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // inline constexpr explicit latch(ptrdiff_t __expected); diff --git a/libcxx/test/std/thread/thread.latch/max.pass.cpp b/libcxx/test/std/thread/thread.latch/max.pass.cpp index 8b9176c8cac57..bcf353ed9712e 100644 --- a/libcxx/test/std/thread/thread.latch/max.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/max.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // #include diff --git a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp index 70ef2cdf71254..8f354463a8697 100644 --- a/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp +++ b/libcxx/test/std/thread/thread.latch/try_wait.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp index 3f6e3107e8bce..22eed736c6b75 100644 --- a/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/acquire.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp index 
111a650b5ea39..c01c78506587c 100644 --- a/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/binary.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp index 28ccc0124d489..dcc298ce11ce8 100644 --- a/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/ctor.compile.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // // constexpr explicit counting_semaphore(ptrdiff_t desired); diff --git a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp index ca7ad0c92e60e..6f3ed5e345e0b 100644 --- a/libcxx/test/std/thread/thread.semaphore/max.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/max.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // #include diff --git a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp index bf3dd7f7d814f..3c4d179e50433 100644 --- a/libcxx/test/std/thread/thread.semaphore/release.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/release.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp index 9fa01fc035904..77f15ece221d4 100644 --- a/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/timed.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp index 0d0f7792592fb..ec159daf87a3f 100644 --- a/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp +++ b/libcxx/test/std/thread/thread.semaphore/try_acquire.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-threads // UNSUPPORTED: c++03, c++11 +// Until we drop support for the synchronization library in C++11/14/17 +// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS + // XFAIL: availability-synchronization_library-missing // diff --git a/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp b/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp index ffc737fcad5dd..20ffb165558e3 100644 --- 
a/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp +++ b/libcxx/test/std/time/time.cal/time.cal.ymd/time.cal.ymd.nonmembers/ostream.pass.cpp @@ -13,9 +13,6 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// TODO FMT Investigate Windows issues. -// XFAIL: msvc - // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 @@ -89,20 +86,9 @@ static void test() { TEST_EQUAL(stream_c_locale( std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}), SV("2000-02-29")); - -#if defined(_AIX) - TEST_EQUAL(stream_c_locale( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_c_locale( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_c_locale( std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), SV("32767-12-31")); -#endif // defined(_AIX) TEST_EQUAL(stream_fr_FR_locale( std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}), @@ -122,19 +108,9 @@ static void test() { TEST_EQUAL(stream_fr_FR_locale( std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_fr_FR_locale( std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), SV("32767-12-31")); -#endif // defined(_AIX) TEST_EQUAL(stream_ja_JP_locale( std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}), @@ -154,19 +130,9 @@ static void test() { TEST_EQUAL(stream_ja_JP_locale( std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale( - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_ja_JP_locale( std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}), SV("32767-12-31")); -#endif // defined(_AIX) } int main(int, char**) { diff --git a/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp index 7af3ebf776807..c5645884cfed0 100644 --- a/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp @@ -13,9 +13,6 @@ // TODO FMT This test should not require std::to_chars(floating-point) // XFAIL: availability-fp_to_chars-missing -// TODO FMT Investigate Windows 
issues. -// XFAIL: msvc - // REQUIRES: locale.fr_FR.UTF-8 // REQUIRES: locale.ja_JP.UTF-8 @@ -81,20 +78,9 @@ static void test() { TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), SV("2000-02-29")); - -#if defined(_AIX) - TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), SV("32767-12-31")); -#endif // defined(_AIX) // multiples of days are considered days. TEST_EQUAL(stream_c_locale(std::chrono::sys_time{std::chrono::weeks{3}}), @@ -112,19 +98,9 @@ static void test() { TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), SV("32767-12-31")); -#endif // defined(_AIX) // multiples of days are considered days. TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_time{std::chrono::weeks{3}}), @@ -142,19 +118,9 @@ static void test() { TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), SV("2000-02-29")); -#if defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("+32767-12-31")); -#elif defined(_WIN32) // defined(_AIX) - TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ - std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), - SV("")); -#else // defined(_AIX) TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), SV("32767-12-31")); -#endif // defined(_AIX) // multiples of days are considered days. 
TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_time{std::chrono::weeks{3}}), diff --git a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp index b07282593d759..f57841cca8629 100644 --- a/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.file_time.pass.cpp @@ -904,12 +904,6 @@ static void test_valid_values_date_time() { template static void test_valid_values_time_zone() { -// The Apple CI gives %z='-0700' %Ez='-0700' %Oz='-0700' %Z='UTC' -// -0700 looks like the local time where the CI happens to reside, therefore -// omit this test on Apple. -// The Windows CI gives %z='-0000', but on local machines set to a different -// timezone, it gives e.g. %z='+0200'. -#if !defined(__APPLE__) && !defined(_WIN32) using namespace std::literals::chrono_literals; constexpr std::basic_string_view fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}"); @@ -918,48 +912,23 @@ static void test_valid_values_time_zone() { const std::locale loc(LOCALE_ja_JP_UTF_8); std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); -# if defined(_AIX) // Non localized output using C-locale - check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), fmt, file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 // Use the global locale (fr_FR) - check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), + check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), lfmt, file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - // Use supplied locale (ja_JP). This locale has a different alternate.a + // Use supplied locale (ja_JP). check(loc, - SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"), - lfmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else // defined(_AIX) - // Non localized output using C-locale - check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - fmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - - // Use the global locale (fr_FR) - check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), + SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"), lfmt, file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 - // Use supplied locale (ja_JP). This locale has a different alternate.a -# if defined(__FreeBSD__) - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - lfmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+〇'\t%Z='UTC'\n"), - lfmt, - file_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# endif -# endif // defined(_AIX) std::locale::global(std::locale::classic()); -#endif // !defined(__APPLE__) && !defined(_WIN32) } template diff --git a/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp new file mode 100644 index 0000000000000..0a41424cfdcb8 --- /dev/null +++ b/libcxx/test/std/time/time.syn/formatter.sys_info.pass.cpp @@ -0,0 +1,137 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-localization
+
+// TODO FMT This test should not require std::to_chars(floating-point)
+// XFAIL: availability-fp_to_chars-missing
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+
+// REQUIRES: locale.fr_FR.UTF-8
+// REQUIRES: locale.ja_JP.UTF-8
+
+// <chrono>
+//
+// template<class charT> struct formatter<chrono::sys_info, charT>;
+
+#include <chrono>
+#include <format>
+
+#include <cassert>
+#include <concepts>
+#include <locale>
+#include <iterator>
+#include <type_traits>
+
+#include "formatter_tests.h"
+#include "make_string.h"
+#include "platform_support.h" // locale name macros
+#include "test_macros.h"
+
+template <class CharT>
+static void test_no_chrono_specs() {
+// This test is libc++ specific due to
+// [time.zone.info.sys]/7
+//   Effects: Streams out the sys_info object r in an unspecified format.
+#ifdef _LIBCPP_VERSION
+  using namespace std::literals::chrono_literals;
+  namespace tz = std::chrono;
+
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output
+
+  check(SV("[-10484-10-16 15:30:08, 14423-03-17 15:30:07) 00:00:00 0min \"TZ\""),
+        SV("{}"),
+        tz::sys_info{tz::sys_seconds::min(), tz::sys_seconds::max(), 0s, 0min, "TZ"});
+
+  check(SV("[1970-01-01 00:00:00, 2038-12-31 00:00:00) 12:23:45 -67min \"DMY\""),
+        SV("{}"),
+        tz::sys_info{static_cast<tz::sys_days>(tz::year_month_day{1970y, tz::January, 1d}),
+                     static_cast<tz::sys_days>(tz::year_month_day{2038y, tz::December, 31d}),
+                     12h + 23min + 45s,
+                     -67min,
+                     "DMY"});
+
+  std::locale::global(std::locale::classic());
+#endif // _LIBCPP_VERSION
+}
+
+template <class CharT>
+static void test_valid_values() {
+  using namespace std::literals::chrono_literals;
+
+  constexpr std::basic_string_view<CharT> fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+  constexpr std::basic_string_view<CharT> lfmt = SV("{:L%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+
+  const std::locale loc(LOCALE_ja_JP_UTF_8);
+  std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+  // Non localized output using C-locale
+  check(SV("%z='-0200'\t%Ez='-02:00'\t%Oz='-02:00'\t%Z='NEG'\n"),
+        fmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, -2h, 0min, "NEG"});
+
+  check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='ZERO'\n"),
+        fmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0s, 0min, "ZERO"});
+
+  check(SV("%z='+1115'\t%Ez='+11:15'\t%Oz='+11:15'\t%Z='POS'\n"),
+        fmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 11h + 15min, 0min, "POS"});
+
+  // Use the global locale (fr_FR)
+  check(SV("%z='-0200'\t%Ez='-02:00'\t%Oz='-02:00'\t%Z='NEG'\n"),
+        lfmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, -2h, 0min, "NEG"});
+
+  check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='ZERO'\n"),
+        lfmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0s, 0min, "ZERO"});
+
+  check(SV("%z='+1115'\t%Ez='+11:15'\t%Oz='+11:15'\t%Z='POS'\n"),
+        lfmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 11h + 15min, 0min, "POS"});
+
+  // Use supplied locale (ja_JP).
+  check(loc,
+        SV("%z='-0200'\t%Ez='-02:00'\t%Oz='-02:00'\t%Z='NEG'\n"),
+        lfmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, -2h, 0min, "NEG"});
+
+  check(loc,
+        SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='ZERO'\n"),
+        lfmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0s, 0min, "ZERO"});
+
+  check(loc,
+        SV("%z='+1115'\t%Ez='+11:15'\t%Oz='+11:15'\t%Z='POS'\n"),
+        lfmt,
+        std::chrono::sys_info{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 11h + 15min, 0min, "POS"});
+
+  std::locale::global(std::locale::classic());
+}
+
+template <class CharT>
+static void test() {
+  test_no_chrono_specs<CharT>();
+  test_valid_values<CharT>();
+
+  check_invalid_types<CharT>({SV("z"), SV("Z"), SV("Ez"), SV("Oz")}, std::chrono::sys_info{});
+}
+
+int main(int, char**) {
+  test<char>();
+
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
index 2fed270cbade7..3a7d6f9a6b01f 100644
--- a/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
+++ b/libcxx/test/std/time/time.syn/formatter.sys_time.pass.cpp
@@ -900,12 +900,6 @@ static void test_valid_values_date_time() {
 
 template <class CharT>
 static void test_valid_values_time_zone() {
-// The Apple CI gives %z='-0700' %Ez='-0700' %Oz='-0700' %Z='UTC'
-// -0700 looks like the local time where the CI happens to reside, therefore
-// omit this test on Apple.
-// The Windows CI gives %z='-0000', but on local machines set to a different
-// timezone, it gives e.g. %z='+0200'.
-#if !defined(__APPLE__) && !defined(_WIN32)
   using namespace std::literals::chrono_literals;
 
   constexpr std::basic_string_view<CharT> fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
@@ -914,48 +908,23 @@ static void test_valid_values_time_zone() {
   const std::locale loc(LOCALE_ja_JP_UTF_8);
   std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
 
-# if defined(_AIX)
   // Non localized output using C-locale
-  check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"),
+  check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
         fmt,
         std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
 
   // Use the global locale (fr_FR)
-  check(SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"),
+  check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
         lfmt,
         std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
 
-  // Use supplied locale (ja_JP). This locale has a different alternate.a
+  // Use supplied locale (ja_JP).
   check(loc,
-        SV("%z='UTC'\t%Ez='UTC'\t%Oz='UTC'\t%Z='UTC'\n"),
-        lfmt,
-        std::chrono::sys_seconds(0s));
-# else // defined(_AIX)
-  // Non localized output using C-locale
-  check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"),
-        fmt,
-        std::chrono::sys_seconds(0s));
-
-  // Use the global locale (fr_FR)
-  check(SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"),
+        SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
         lfmt,
         std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
 
-  // Use supplied locale (ja_JP).
This locale has a different alternate.a -# if defined(__FreeBSD__) - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+0000'\t%Z='UTC'\n"), - lfmt, - std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# else - check(loc, - SV("%z='+0000'\t%Ez='+0000'\t%Oz='+〇'\t%Z='UTC'\n"), - lfmt, - std::chrono::sys_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 -# endif -# endif // defined(_AIX) std::locale::global(std::locale::classic()); -#endif // !defined(__APPLE__) && !defined(_WIN32) } template diff --git a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp index 5a2b7afa37a86..1f2af1cb0530d 100644 --- a/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp +++ b/libcxx/test/std/time/time.syn/formatter.year_month_day.pass.cpp @@ -62,17 +62,6 @@ static void test_no_chrono_specs() { std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{2}, std::chrono::day{31}}); // Valid year, invalid month, valid day -#ifdef _WIN32 - check(SV(" is not a valid date"), - SV("{}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); - check(SV("****** is not a valid date******"), - SV("{:*^32}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); - check(SV("*********** is not a valid date"), - SV("{:*>31}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); -#else // _WIN32 check(SV("1970-00-31 is not a valid date"), SV("{}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); @@ -82,20 +71,8 @@ static void test_no_chrono_specs() { check(SV("*1970-00-31 is not a valid date"), SV("{:*>31}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{31}}); -#endif // _WIN32 // Valid year, invalid month, invalid day -#ifdef _WIN32 - check(SV(" is not a valid date"), - SV("{}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); - check(SV("****** is not a valid date******"), - SV("{:*^32}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); - check(SV("*********** is not a valid date"), - SV("{:*>31}"), - std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); -#else // _WIN32 check(SV("1970-00-32 is not a valid date"), SV("{}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); @@ -105,7 +82,6 @@ static void test_no_chrono_specs() { check(SV("*1970-00-32 is not a valid date"), SV("{:*>31}"), std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{0}, std::chrono::day{32}}); -#endif // _WIN32 // Invalid year, valid month, valid day check(SV("-32768-01-31 is not a valid date"), diff --git a/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp new file mode 100644 index 0000000000000..82c4844b423c5 --- /dev/null +++ b/libcxx/test/std/time/time.zone/time.zone.info/time.zone.info.sys/ostream.pass.cpp @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-localization
+
+// TODO FMT This test should not require std::to_chars(floating-point)
+// XFAIL: availability-fp_to_chars-missing
+
+// XFAIL: libcpp-has-no-incomplete-tzdb
+
+// <chrono>
+
+// template<class charT, class traits>
+//   basic_ostream<charT, traits>&
+//     operator<<(basic_ostream<charT, traits>& os, const sys_info& r);
+
+// [time.zone.info.sys]
+//   7 Effects: Streams out the sys_info object r in an unspecified format.
+//   8 Returns: os.
+//
+// There is a private libc++ test that validates the exact output.
+
+#include <chrono>
+#include <cassert>
+#include <memory>
+#include <sstream>
+
+#include "test_macros.h"
+
+template <class CharT>
+static void test() {
+  using namespace std::literals::chrono_literals;
+  std::chrono::sys_info s{std::chrono::sys_seconds{0s}, std::chrono::sys_seconds{0s}, 0h, 0min, ""};
+  std::basic_ostringstream<CharT> os;
+  std::basic_ostream<CharT>& result = std::chrono::operator<<(os, s);
+  assert(std::addressof(result) == std::addressof(os));
+}
+
+int main(int, const char**) {
+  test<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
index a751a2fb6347b..d27cf0bd89062 100644
--- a/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
+++ b/libcxx/test/std/time/time.zone/time.zone.timezone/time.zone.members/get_info.sys_time.pass.cpp
@@ -6,7 +6,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-// UNSUPPORTED: c++03, c++11, c++14, c++17
+// TODO TZDB review the test based on review comments in
+// https://github.com/llvm/llvm-project/pull/85619
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23, c++26
 // UNSUPPORTED: no-filesystem, no-localization, no-tzdb
 
 // XFAIL: libcpp-has-no-incomplete-tzdb
diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
index fad5277fd6e11..f40784ec446fa 100644
--- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
+++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp
@@ -176,10 +176,12 @@ void test_P1361() {
 
   assert_is_formattable, CharT>();
 
-  //assert_is_formattable<std::chrono::sys_info, CharT>();
+#  if !defined(TEST_HAS_NO_INCOMPLETE_TZDB)
+  assert_is_formattable<std::chrono::sys_info, CharT>();
   //assert_is_formattable<std::chrono::local_info, CharT>();
 
   //assert_is_formattable<std::chrono::zoned_time, CharT>();
+#  endif // !defined(TEST_HAS_NO_INCOMPLETE_TZDB)
 #endif // TEST_HAS_NO_LOCALIZATION
 }
 
diff --git a/libcxx/test/support/test_iterators.h b/libcxx/test/support/test_iterators.h
index 7ffb74990fa4d..aa819ecd4733b 100644
--- a/libcxx/test/support/test_iterators.h
+++ b/libcxx/test/support/test_iterators.h
@@ -1484,9 +1484,14 @@ class iterator_wrapper {
     return tmp;
   }
 
-  iterator_wrapper& operator+=(difference_type i) {
+  Derived& operator+=(difference_type i) {
     iter_ += i;
-    return *this;
+    return static_cast<Derived&>(*this);
+  }
+
+  Derived& operator-=(difference_type i) {
+    iter_ -= i;
+    return static_cast<Derived&>(*this);
   }
 
   friend decltype(iter_ - iter_) operator-(const iterator_wrapper& lhs, const iterator_wrapper& rhs) {
@@ -1503,8 +1508,15 @@ class iterator_wrapper {
     return iter;
   }
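The operator+= / operator-= change above is the classic CRTP return-type trick: the base class does the arithmetic once, but returns the most-derived iterator type so that expressions keep the full interface. A standalone sketch of the idiom (the names are illustrative, not from test_iterators.h):

    #include <cstddef>

    // The base class implements the compound assignments; Derived names the
    // concrete iterator so the operators can return Derived& instead of the
    // base type.
    template <class Derived>
    struct arithmetic_ops {
      std::ptrdiff_t pos = 0;

      Derived& operator+=(std::ptrdiff_t n) {
        pos += n;
        return static_cast<Derived&>(*this); // safe: *this is always a Derived
      }
      Derived& operator-=(std::ptrdiff_t n) { return *this += -n; }
    };

    struct my_iterator : arithmetic_ops<my_iterator> {};

    int main() {
      my_iterator it;
      my_iterator& same = (it += 3); // Derived&, so chaining keeps the type
      return same.pos == 3 ? 0 : 1;
    }

The remaining additions to the wrapper continue below.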
+ friend Derived operator+(difference_type i, Derived iter) { return iter + i; } + friend bool operator==(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ == rhs.iter_; } friend bool operator!=(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ != rhs.iter_; } + + friend bool operator>(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ > rhs.iter_; } + friend bool operator<(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ < rhs.iter_; } + friend bool operator<=(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ <= rhs.iter_; } + friend bool operator>=(const iterator_wrapper& lhs, const iterator_wrapper& rhs) { return lhs.iter_ >= rhs.iter_; } }; class iterator_error : std::runtime_error { diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 7b2dcbb52d0c8..fe9207d7e5969 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -417,6 +417,14 @@ inline Tp const& DoNotOptimize(Tp const& value) { # define TEST_HAS_NO_RANDOM_DEVICE #endif +#if defined(_LIBCPP_HAS_NO_INCOMPLETE_TZDB) +# define TEST_HAS_NO_INCOMPLETE_TZDB +#endif + +#if defined(_LIBCPP_HAS_NO_TIME_ZONE_DATABASE) +# define TEST_HAS_NO_TIME_ZONE_DATABASE +#endif + #if defined(TEST_COMPILER_CLANG) # define TEST_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") # define TEST_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 6ff16309546ba..c81b56b1af547 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -318,7 +318,7 @@ def _getAndroidDeviceApi(cfg): "_LIBCPP_HAS_NO_WIDE_CHARACTERS": "no-wide-characters", "_LIBCPP_HAS_NO_TIME_ZONE_DATABASE": "no-tzdb", "_LIBCPP_HAS_NO_UNICODE": "libcpp-has-no-unicode", - "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH": "libcpp-pstl-cpu-backend-libdispatch", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH": "libcpp-pstl-backend-libdispatch", } for macro, feature in macros.items(): DEFAULT_FEATURES.append( diff --git a/libcxx/utils/libcxx/test/modules.py b/libcxx/utils/libcxx/test/modules.py index 3f3c7999a1a21..aab7651c7bb03 100644 --- a/libcxx/utils/libcxx/test/modules.py +++ b/libcxx/utils/libcxx/test/modules.py @@ -26,8 +26,6 @@ # The operators are added for private types like __iom_t10. SkipDeclarations["iomanip"] = ["std::operator<<", "std::operator>>"] -SkipDeclarations["iosfwd"] = ["std::ios_base", "std::vector"] - # This header also provides declarations in the namespace that might be # an error. 
 SkipDeclarations["filesystem"] = [
@@ -54,8 +52,6 @@
     "std::operator==",
 ]
 
-# Declared in the forward header since std::string uses std::allocator
-SkipDeclarations["string"] = ["std::allocator"]
 # TODO MODULES remove zombie names
 # https://libcxx.llvm.org/Status/Cxx20.html#note-p0619
 SkipDeclarations["memory"] = [
@@ -63,9 +59,6 @@
     "std::get_temporary_buffer",
 ]
 
-# TODO MODULES this should be part of ios instead
-SkipDeclarations["streambuf"] = ["std::basic_ios"]
-
 # include/__type_traits/is_swappable.h
 SkipDeclarations["type_traits"] = [
     "std::swap",
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 9c20bbb83d86d..7269d156752de 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -2072,8 +2072,16 @@ void Writer::createRuntimePseudoRelocs() {
     return;
   }
 
-  if (!rels.empty())
+  if (!rels.empty()) {
     log("Writing " + Twine(rels.size()) + " runtime pseudo relocations");
+    const char *symbolName = "_pei386_runtime_relocator";
+    Symbol *relocator = ctx.symtab.findUnderscore(symbolName);
+    if (!relocator)
+      error("output image has runtime pseudo relocations, but the function " +
+            Twine(symbolName) +
+            " is missing; it is needed for fixing the relocations at runtime");
+  }
+
   PseudoRelocTableChunk *table = make<PseudoRelocTableChunk>(rels);
   rdataSec->addChunk(table);
   EmptyChunk *endOfList = make<EmptyChunk>();
diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp
index c8350652e65a6..fa48552b8f7a1 100644
--- a/lld/ELF/InputSection.cpp
+++ b/lld/ELF/InputSection.cpp
@@ -464,7 +464,11 @@ void InputSection::copyRelocations(uint8_t *buf,
       addend += sec->getFile<ELFT>()->mipsGp0;
     }
 
-    if (RelTy::IsRela)
+    if (config->emachine == EM_LOONGARCH && type == R_LARCH_ALIGN)
+      // Per LoongArch psABI v2.30, R_LARCH_ALIGN requires a symbol index.
+      // If it uses the section symbol, the addend should not be changed.
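An aside on the encoding that comment refers to, before the code resumes: when R_LARCH_ALIGN is emitted against a section symbol, my reading of psABI v2.30 is that the addend packs log2(alignment) in the low 8 bits and the maximum number of padding bytes above them, which is why a relocatable link must copy it through untouched. A decode sketch under that assumption (0x804 is the addend the new loongarch-relax-align-ldr.s test below checks for):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // ".p2align 4, , 8" -> "R_LARCH_ALIGN .text+0x804" in the test below.
      uint64_t addend   = 0x804;
      uint64_t align    = uint64_t{1} << (addend & 0xff); // low 8 bits: log2
      uint64_t max_skip = addend >> 8;                    // rest: byte budget
      std::printf("align=%llu max_skip=%llu\n",
                  (unsigned long long)align,
                  (unsigned long long)max_skip); // align=16 max_skip=8
    }

If the linker rewrote the addend the way it does for ordinary RELA relocations, the alignment directive would be corrupted in the merged object.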
+ p->r_addend = addend; + else if (RelTy::IsRela) p->r_addend = sym.getVA(addend) - section->getOutputSection()->addr; // For SHF_ALLOC sections relocated by REL, append a relocation to // sec->relocations so that relocateAlloc transitively called by diff --git a/lld/test/COFF/autoimport-arm-data.s b/lld/test/COFF/autoimport-arm-data.s index 74604aa5c8234..82c66f0989d49 100644 --- a/lld/test/COFF/autoimport-arm-data.s +++ b/lld/test/COFF/autoimport-arm-data.s @@ -33,6 +33,9 @@ .text .thumb main: + bx lr + .global _pei386_runtime_relocator +_pei386_runtime_relocator: bx lr .data ptr: diff --git a/lld/test/COFF/autoimport-arm64-data.s b/lld/test/COFF/autoimport-arm64-data.s index fa3654be3a71d..b49bd4f89c97c 100644 --- a/lld/test/COFF/autoimport-arm64-data.s +++ b/lld/test/COFF/autoimport-arm64-data.s @@ -33,6 +33,9 @@ .global main .text main: + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data ptr: diff --git a/lld/test/COFF/autoimport-gnu-implib.s b/lld/test/COFF/autoimport-gnu-implib.s index d7d4ed626e83a..d9dc9d7a38fdc 100644 --- a/lld/test/COFF/autoimport-gnu-implib.s +++ b/lld/test/COFF/autoimport-gnu-implib.s @@ -27,5 +27,8 @@ .text main: movl data(%rip), %eax + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data diff --git a/lld/test/COFF/autoimport-handler-func.s b/lld/test/COFF/autoimport-handler-func.s new file mode 100644 index 0000000000000..02d040bfa274c --- /dev/null +++ b/lld/test/COFF/autoimport-handler-func.s @@ -0,0 +1,36 @@ +# REQUIRES: x86 +# RUN: split-file %s %t.dir + +# RUN: llvm-dlltool -m i386:x86-64 -d %t.dir/lib.def -D lib.dll -l %t.dir/lib.lib + +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/main.s -filetype=obj -o %t.dir/main.obj +# RUN: llvm-mc -triple=x86_64-windows-gnu %t.dir/func.s -filetype=obj -o %t.dir/func.obj +# RUN: env LLD_IN_TEST=1 not lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/lib.lib 2>&1 | FileCheck %s --check-prefix=ERR + +# RUN: lld-link -lldmingw -out:%t.dir/main.exe -entry:main %t.dir/main.obj %t.dir/func.obj %t.dir/lib.lib 2>&1 | FileCheck %s --check-prefix=NOERR --allow-empty + +# ERR: error: output image has runtime pseudo relocations, but the function _pei386_runtime_relocator is missing; it is needed for fixing the relocations at runtime + +# NOERR-NOT: error + +#--- main.s + .global main + .text +main: + ret + + .data + .long 1 + .quad variable + .long 2 + +#--- func.s + .global _pei386_runtime_relocator + .text +_pei386_runtime_relocator: + ret + +#--- lib.def +EXPORTS +variable DATA + diff --git a/lld/test/COFF/autoimport-warn.s b/lld/test/COFF/autoimport-warn.s index 9c363ed30f245..eead0fed861f8 100644 --- a/lld/test/COFF/autoimport-warn.s +++ b/lld/test/COFF/autoimport-warn.s @@ -18,6 +18,9 @@ main: movl variable2(%rip), %ecx addl %ecx, %eax ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: + ret .section .rdata$.refptr.variable1,"dr",discard,.refptr.variable1 .global .refptr.variable1 diff --git a/lld/test/COFF/autoimport-x86.s b/lld/test/COFF/autoimport-x86.s index fa36f10e9ca91..5d7c9c2c3fa58 100644 --- a/lld/test/COFF/autoimport-x86.s +++ b/lld/test/COFF/autoimport-x86.s @@ -55,6 +55,9 @@ .text main: movl variable(%rip), %eax + ret + .global _pei386_runtime_relocator +_pei386_runtime_relocator: ret .data ptr: diff --git a/lld/test/ELF/loongarch-relax-align-ldr.s b/lld/test/ELF/loongarch-relax-align-ldr.s new file mode 100644 index 0000000000000..6534dc906cfd0 --- /dev/null +++ b/lld/test/ELF/loongarch-relax-align-ldr.s @@ 
-0,0 +1,28 @@ +# REQUIRES: loongarch +## Test `ld -r` not changes the addend of R_LARCH_ALIGN. + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.64.o +# RUN: ld.lld -r %t.64.o %t.64.o -o %t.64.r +# RUN: llvm-objdump -dr --no-show-raw-insn %t.64.r | FileCheck %s + +# CHECK: <.text>: +# CHECK-NEXT: break 1 +# CHECK-NEXT: nop +# CHECK-NEXT: {{0*}}04: R_LARCH_ALIGN .text+0x804 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 2 +# CHECK-NEXT: break 0 +# CHECK-NEXT: break 0 +# CHECK-NEXT: break 0 +# CHECK-NEXT: break 1 +# CHECK-NEXT: nop +# CHECK-NEXT: {{0*}}24: R_LARCH_ALIGN .text+0x804 +# CHECK-NEXT: nop +# CHECK-NEXT: nop +# CHECK-NEXT: break 2 + +.text +break 1 +.p2align 4, , 8 +break 2 diff --git a/lld/test/ELF/loongarch-relax-emit-relocs.s b/lld/test/ELF/loongarch-relax-emit-relocs.s index 581fce8c95caa..9007f8fcc114f 100644 --- a/lld/test/ELF/loongarch-relax-emit-relocs.s +++ b/lld/test/ELF/loongarch-relax-emit-relocs.s @@ -25,7 +25,7 @@ # CHECK-NEXT: R_LARCH_PCALA_LO12 _start # CHECK-NEXT: R_LARCH_RELAX *ABS* # CHECK-NEXT: nop -# CHECK-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECK-NEXT: R_LARCH_ALIGN .text+0x4 # CHECK-NEXT: nop # CHECK-NEXT: ret @@ -37,11 +37,12 @@ # CHECKR-NEXT: R_LARCH_PCALA_LO12 _start # CHECKR-NEXT: R_LARCH_RELAX *ABS* # CHECKR-NEXT: nop -# CHECKR-NEXT: R_LARCH_ALIGN .Lla-relax-align0+0x4 +# CHECKR-NEXT: R_LARCH_ALIGN .text+0x4 # CHECKR-NEXT: nop # CHECKR-NEXT: nop # CHECKR-NEXT: ret +.text .global _start _start: la.pcrel $a0, _start diff --git a/lldb/cmake/modules/LLDBFramework.cmake b/lldb/cmake/modules/LLDBFramework.cmake index 81fc596ef4244..f915839f6b45a 100644 --- a/lldb/cmake/modules/LLDBFramework.cmake +++ b/lldb/cmake/modules/LLDBFramework.cmake @@ -119,7 +119,7 @@ add_custom_command(TARGET liblldb POST_BUILD if(NOT APPLE_EMBEDDED) if (TARGET clang-resource-headers) add_dependencies(liblldb clang-resource-headers) - set(clang_resource_headers_dir $) + set(clang_resource_headers_dir $) else() set(clang_resource_headers_dir ${LLDB_EXTERNAL_CLANG_RESOURCE_DIR}/include) if(NOT EXISTS ${clang_resource_headers_dir}) diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py index 8c29145ecc527..2ec4a840b9167 100644 --- a/lldb/packages/Python/lldbsuite/test/dotest.py +++ b/lldb/packages/Python/lldbsuite/test/dotest.py @@ -248,7 +248,7 @@ def parseOptionsAndInitTestdirs(): configuration.compiler = which(args.compiler) if not is_exe(configuration.compiler): logging.error( - "%s is not a valid compiler executable; aborting...", args.compiler + '"%s" is not a valid compiler executable; aborting...', args.compiler ) sys.exit(-1) else: diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp index cb9bee8733e15..7ad0e5ff22b2f 100644 --- a/lldb/source/Expression/IRExecutionUnit.cpp +++ b/lldb/source/Expression/IRExecutionUnit.cpp @@ -13,6 +13,7 @@ #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" @@ -279,10 +280,13 @@ void IRExecutionUnit::GetRunnableInfo(Status &error, lldb::addr_t &func_addr, llvm::EngineBuilder builder(std::move(m_module_up)); llvm::Triple triple(m_module->getTargetTriple()); + // PIC needed for ELF to avoid generating 32-bit relocations (which overflow + // if the object is loaded into high memory). 
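Condensed, the IRExecutionUnit decision that comment introduces looks like the following free-standing sketch against the MCJIT EngineBuilder API (the function name and error handling are mine; the real code also installs a custom memory manager):

    #include "llvm/ExecutionEngine/ExecutionEngine.h"
    #include "llvm/ExecutionEngine/MCJIT.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/CodeGen.h"
    #include <memory>
    #include <string>

    // Use PIC for Mach-O and ELF so the JIT-ed object never needs 32-bit
    // absolute relocations, which overflow when it lands in high memory.
    llvm::ExecutionEngine *makeEngine(std::unique_ptr<llvm::Module> module) {
      std::string error;
      return llvm::EngineBuilder(std::move(module))
          .setEngineKind(llvm::EngineKind::JIT)
          .setErrorStr(&error)
          .setRelocationModel(llvm::Reloc::PIC_)
          .setOptLevel(llvm::CodeGenOptLevel::Less)
          .create();
    }

The actual change below merely widens the existing Mach-O-only condition to cover ELF as well.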
+ bool want_pic = triple.isOSBinFormatMachO() || triple.isOSBinFormatELF(); + builder.setEngineKind(llvm::EngineKind::JIT) .setErrorStr(&error_string) - .setRelocationModel(triple.isOSBinFormatMachO() ? llvm::Reloc::PIC_ - : llvm::Reloc::Static) + .setRelocationModel(want_pic ? llvm::Reloc::PIC_ : llvm::Reloc::Static) .setMCJITMemoryManager(std::make_unique(*this)) .setOptLevel(llvm::CodeGenOptLevel::Less); diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp index 5d2b4b03fe60c..59fc8726b7673 100644 --- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp @@ -1089,6 +1089,10 @@ Status NativeProcessLinux::Detach() { if (GetID() == LLDB_INVALID_PROCESS_ID) return error; + // Cancel out any SIGSTOPs we may have sent while stopping the process. + // Otherwise, the process may stop as soon as we detach from it. + kill(GetID(), SIGCONT); + for (const auto &thread : m_threads) { Status e = Detach(thread->GetID()); if (e.Fail()) diff --git a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp index 2032c5a68d054..6bfaa54135a95 100644 --- a/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp +++ b/lldb/source/Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.cpp @@ -909,6 +909,9 @@ bool x86AssemblyInspectionEngine::GetNonCallSiteUnwindPlanFromAssembly( if (!m_register_map_initialized) return false; + if (m_disasm_context == nullptr) + return false; + addr_t current_func_text_offset = 0; int current_sp_bytes_offset_from_fa = 0; bool is_aligned = false; @@ -1570,6 +1573,9 @@ bool x86AssemblyInspectionEngine::FindFirstNonPrologueInstruction( if (!m_register_map_initialized) return false; + if (m_disasm_context == nullptr) + return false; + while (offset < size) { int regno; int insn_len; diff --git a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py index d8f45161378b0..4d9b036f5102c 100644 --- a/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py +++ b/lldb/test/API/commands/frame/diagnose/dereference-function-return/TestDiagnoseDereferenceFunctionReturn.py @@ -19,6 +19,9 @@ def test_diagnose_dereference_function_return(self): TestBase.setUp(self) self.build() exe = self.getBuildArtifact("a.out") + # FIXME: This default changed in lldbtest.py and this test + # seems to rely on having it turned off. 
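The NativeProcessLinux::Detach change above is small but order-sensitive; roughly, as a sketch (the real code then detaches every thread individually via ptrace):

    #include <csignal>
    #include <sys/types.h>

    // Cancel any SIGSTOP the debugger queued while interrupting threads by
    // sending SIGCONT *before* detaching; otherwise the inferior can end up
    // stopped the moment it is released.
    void detachKeepRunning(pid_t pid) {
      ::kill(pid, SIGCONT);
      // ... PTRACE_DETACH each thread here, as NativeProcessLinux::Detach does.
    }

The detach-resumes test added below exercises exactly this race.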
+ self.runCmd("settings set target.disable-aslr true") self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) self.runCmd("run", RUN_SUCCEEDED) self.expect("thread list", "Thread should be stopped", substrs=["stopped"]) diff --git a/lldb/test/API/commands/process/detach-resumes/Makefile b/lldb/test/API/commands/process/detach-resumes/Makefile new file mode 100644 index 0000000000000..c46619c662348 --- /dev/null +++ b/lldb/test/API/commands/process/detach-resumes/Makefile @@ -0,0 +1,4 @@ +CXX_SOURCES := main.cpp +ENABLE_THREADS := YES + +include Makefile.rules diff --git a/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py new file mode 100644 index 0000000000000..797db55a45b9b --- /dev/null +++ b/lldb/test/API/commands/process/detach-resumes/TestDetachResumes.py @@ -0,0 +1,60 @@ +""" +Test that the process continues running after we detach from it. +""" + +import lldb +import time +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class DetachResumesTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr89077") + def test_detach_resumes(self): + self.build() + exe = self.getBuildArtifact() + + # The inferior will use this file to let us know it is ready to be + # attached. + sync_file_path = lldbutil.append_to_process_working_directory( + self, "sync_file_%d" % (int(time.time())) + ) + + # And this one to let us know it is running after we've detached from + # it. + exit_file_path = lldbutil.append_to_process_working_directory( + self, "exit_file_%d" % (int(time.time())) + ) + + popen = self.spawnSubprocess( + self.getBuildArtifact(exe), [sync_file_path, exit_file_path] + ) + lldbutil.wait_for_file_on_target(self, sync_file_path) + + self.runCmd("process attach -p " + str(popen.pid)) + + # Set a breakpoint at a place that will be called by multiple threads + # simultaneously. On systems (e.g. linux) where the debugger needs to + # send signals to suspend threads, these signals will race with threads + # hitting the breakpoint (and stopping on their own). + bpno = lldbutil.run_break_set_by_symbol(self, "break_here") + + # And let the inferior know it can call the function. + self.runCmd("expr -- wait_for_debugger_flag = false") + + self.runCmd("continue") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # Detach, the process should keep running after this, and not be stopped + # by the signals that the debugger may have used to suspend the threads. + self.runCmd("detach") + + lldbutil.wait_for_file_on_target(self, exit_file_path) diff --git a/lldb/test/API/commands/process/detach-resumes/main.cpp b/lldb/test/API/commands/process/detach-resumes/main.cpp new file mode 100644 index 0000000000000..e8050fef2c385 --- /dev/null +++ b/lldb/test/API/commands/process/detach-resumes/main.cpp @@ -0,0 +1,48 @@ +#include "pseudo_barrier.h" +#include +#include +#include +#include +#include +#include + +pseudo_barrier_t barrier; + +constexpr size_t nthreads = 5; +volatile bool wait_for_debugger_flag = true; + +void break_here() {} + +void tfunc() { + pseudo_barrier_wait(barrier); + + break_here(); +} + +int main(int argc, char const *argv[]) { + lldb_enable_attach(); + + if (argc < 3) + return 1; + + // Create a file to signal that this process has started up. 
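The inferior's main.cpp (continued below) synchronizes its threads with the suite's pseudo_barrier so they all reach break_here() together, maximizing the chance that breakpoint hits race with the debugger's stop signals. The same shape in portable C++20, if it helps to read it without the test-suite helpers:

    #include <barrier>
    #include <thread>
    #include <vector>

    void break_here() {}

    int main() {
      constexpr int nthreads = 5;
      std::barrier<> gate(nthreads); // released once all threads arrive
      std::vector<std::thread> threads;
      for (int i = 0; i < nthreads; ++i)
        threads.emplace_back([&gate] {
          gate.arrive_and_wait(); // every thread blocks here ...
          break_here();           // ... then calls the breakpoint at once
        });
      for (std::thread &t : threads)
        t.join();
    }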
+  std::ofstream(argv[1]).close();
+
+  // And wait for it to attach.
+  for (int i = 0; i < 100 && wait_for_debugger_flag; ++i)
+    std::this_thread::sleep_for(std::chrono::seconds(1));
+
+  // Fire up the threads and have them call break_here() simultaneously.
+  pseudo_barrier_init(barrier, nthreads);
+  std::vector<std::thread> threads;
+  for (size_t i = 0; i < nthreads; ++i)
+    threads.emplace_back(tfunc);
+
+  for (std::thread &t : threads)
+    t.join();
+
+  // Create the file to let the debugger know we're running.
+  std::ofstream(argv[2]).close();
+
+  return 0;
+}
diff --git a/lldb/test/API/functionalities/asan/Makefile b/lldb/test/API/functionalities/asan/Makefile
index 4913a18d8cc6f..d66696fed7078 100644
--- a/lldb/test/API/functionalities/asan/Makefile
+++ b/lldb/test/API/functionalities/asan/Makefile
@@ -1,4 +1,8 @@
 C_SOURCES := main.c
-CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info
+asan: CFLAGS_EXTRAS := -fsanitize=address -g -gcolumn-info
+asan: all
+
+libsanitizers: CFLAGS_EXTRAS := -fsanitize=address -fsanitize-stable-abi -g -gcolumn-info
+libsanitizers: all
 
 include Makefile.rules
diff --git a/lldb/test/API/functionalities/asan/TestMemoryHistory.py b/lldb/test/API/functionalities/asan/TestMemoryHistory.py
index 00162ae8822c7..41ab25823f5cc 100644
--- a/lldb/test/API/functionalities/asan/TestMemoryHistory.py
+++ b/lldb/test/API/functionalities/asan/TestMemoryHistory.py
@@ -8,16 +8,24 @@
 from lldbsuite.test.lldbtest import *
 from lldbsuite.test import lldbplatform
 from lldbsuite.test import lldbutil
-
+from lldbsuite.test_event.build_exception import BuildError
 
 class AsanTestCase(TestBase):
     @skipIfFreeBSD  # llvm.org/pr21136 runtimes not yet available by default
     @expectedFailureNetBSD
     @skipUnlessAddressSanitizer
     def test(self):
-        self.build()
+        self.build(make_targets=["asan"])
         self.asan_tests()
 
+    @skipIf(oslist=no_match(["macosx"]))
+    def test_libsanitizers_asan(self):
+        try:
+            self.build(make_targets=["libsanitizers"])
+        except BuildError as e:
+            self.skipTest("failed to build with libsanitizers")
+        self.libsanitizer_tests()
+
     def setUp(self):
         # Call super's setUp().
TestBase.setUp(self) @@ -26,6 +34,68 @@ def setUp(self): self.line_free = line_number("main.c", "// free line") self.line_breakpoint = line_number("main.c", "// break line") + # Test line numbers: rdar://126237493 + def libsanitizer_tests(self): + target = self.createTestTarget() + + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + + self.runCmd("run") + + # In libsanitizers, memory history is not supported until a report has been generated + self.expect( + "thread list", + "Process should be stopped due to ASan report", + substrs=["stopped", "stop reason = Use of deallocated memory"], + ) + + # test the 'memory history' command + self.expect( + "memory history 'pointer'", + substrs=[ + "Memory deallocated by Thread", + "a.out`f2", + "main.c", + "Memory allocated by Thread", + "a.out`f1", + "main.c", + ], + ) + + # do the same using SB API + process = self.dbg.GetSelectedTarget().process + val = ( + process.GetSelectedThread().GetSelectedFrame().EvaluateExpression("pointer") + ) + addr = val.GetValueAsUnsigned() + threads = process.GetHistoryThreads(addr) + self.assertEqual(threads.GetSize(), 2) + + history_thread = threads.GetThreadAtIndex(0) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + history_thread = threads.GetThreadAtIndex(1) + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + + # let's free the container (SBThreadCollection) and see if the + # SBThreads still live + threads = None + self.assertTrue(history_thread.num_frames >= 2) + self.assertEqual( + history_thread.frames[1].GetLineEntry().GetFileSpec().GetFilename(), + "main.c", + ) + def asan_tests(self): target = self.createTestTarget() diff --git a/lldb/test/API/functionalities/asan/TestReportData.py b/lldb/test/API/functionalities/asan/TestReportData.py index 543c5fe66a208..5e4c179e2a481 100644 --- a/lldb/test/API/functionalities/asan/TestReportData.py +++ b/lldb/test/API/functionalities/asan/TestReportData.py @@ -8,7 +8,7 @@ from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil - +from lldbsuite.test_event.build_exception import BuildError class AsanTestReportDataCase(TestBase): @skipIfFreeBSD # llvm.org/pr21136 runtimes not yet available by default @@ -16,9 +16,17 @@ class AsanTestReportDataCase(TestBase): @skipUnlessAddressSanitizer @skipIf(archs=["i386"], bugnumber="llvm.org/PR36710") def test(self): - self.build() + self.build(make_targets=["asan"]) self.asan_tests() + @skipIf(oslist=no_match(["macosx"])) + def test_libsanitizers_asan(self): + try: + self.build(make_targets=["libsanitizers"]) + except BuildError as e: + self.skipTest("failed to build with libsanitizers") + self.asan_tests(libsanitizers=True) + def setUp(self): # Call super's setUp(). 
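The SB API portion of the test above (GetHistoryThreads and the lifetime check on the returned collection) maps one-to-one onto the C++ SB API; a minimal sketch, assuming a process running under a memory-history-capable sanitizer runtime:

    #include "lldb/API/SBProcess.h"
    #include "lldb/API/SBThread.h"
    #include "lldb/API/SBThreadCollection.h"

    // Ask for the recorded allocation/deallocation events for an address.
    // One SBThread is returned per recorded event (here: one allocation and
    // one deallocation); the test additionally checks that those SBThreads
    // stay valid after the owning collection is dropped.
    lldb::SBThreadCollection historyFor(lldb::SBProcess &process,
                                        lldb::addr_t addr) {
      return process.GetHistoryThreads(addr);
    }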
TestBase.setUp(self) @@ -29,10 +37,15 @@ def setUp(self): self.line_crash = line_number("main.c", "// BOOM line") self.col_crash = 16 - def asan_tests(self): + def asan_tests(self, libsanitizers=False): target = self.createTestTarget() - self.registerSanitizerLibrariesWithTarget(target) + if libsanitizers: + self.runCmd( + "env SanitizersAddress=1 MallocSanitizerZone=1 MallocSecureAllocator=0" + ) + else: + self.registerSanitizerLibrariesWithTarget(target) self.runCmd("run") diff --git a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py index 1790bd497f4e6..2dcbb728549fb 100644 --- a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py +++ b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py @@ -48,8 +48,6 @@ def follow_child_helper(self, use_fork, call_exec): self.expect("continue", patterns=[r"exited with status = 1[0-4]"]) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_vfork_no_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent. @@ -58,8 +56,6 @@ def test_follow_parent_vfork_no_exec(self): self.follow_parent_helper(use_fork=False, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_fork_no_exec(self): """ Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-parent. @@ -68,8 +64,6 @@ def test_follow_parent_fork_no_exec(self): self.follow_parent_helper(use_fork=True, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_vfork_call_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent. @@ -78,8 +72,6 @@ def test_follow_parent_vfork_call_exec(self): self.follow_parent_helper(use_fork=False, call_exec=True) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_parent_fork_call_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent. @@ -88,8 +80,6 @@ def test_follow_parent_fork_call_exec(self): self.follow_parent_helper(use_fork=True, call_exec=True) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_vfork_no_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child. @@ -98,8 +88,6 @@ def test_follow_child_vfork_no_exec(self): self.follow_child_helper(use_fork=False, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_fork_no_exec(self): """ Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child. 
@@ -108,8 +96,6 @@ def test_follow_child_fork_no_exec(self): self.follow_child_helper(use_fork=True, call_exec=False) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_vfork_call_exec(self): """ Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child. @@ -118,8 +104,6 @@ def test_follow_child_vfork_call_exec(self): self.follow_child_helper(use_fork=False, call_exec=True) @skipUnlessPlatform(["linux"]) - # See https://github.com/llvm/llvm-project/issues/85084. - @skipIf(oslist=["linux"], archs=["aarch64", "arm"]) def test_follow_child_fork_call_exec(self): """ Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child. diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py index 290569576ac80..e24f3fbb4d931 100644 --- a/lldb/test/Shell/lit.cfg.py +++ b/lldb/test/Shell/lit.cfg.py @@ -50,10 +50,14 @@ ) # Enable sanitizer runtime flags. -config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" -config.environment["TSAN_OPTIONS"] = "halt_on_error=1" -if platform.system() == "Darwin": - config.environment["MallocNanoZone"] = "0" +if "Address" in config.llvm_use_sanitizer: + config.environment["ASAN_OPTIONS"] = "detect_stack_use_after_return=1" + if platform.system() == "Darwin": + config.environment["MallocNanoZone"] = "0" + +if "Thread" in config.llvm_use_sanitizer: + config.environment["TSAN_OPTIONS"] = "halt_on_error=1" + # Support running the test suite under the lldb-repro wrapper. This makes it # possible to capture a test suite run and then rerun all the test from the diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index 736dfc335732b..b69e7bce1bc0b 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -26,6 +26,7 @@ config.lldb_enable_lua = @LLDB_ENABLE_LUA@ config.lldb_build_directory = "@LLDB_TEST_BUILD_DIRECTORY@" config.have_lldb_server = @LLDB_TOOL_LLDB_SERVER_BUILD@ config.lldb_system_debugserver = @LLDB_USE_SYSTEM_DEBUGSERVER@ +config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" # The shell tests use their own module caches. 
config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") diff --git a/lldb/unittests/UnwindAssembly/CMakeLists.txt b/lldb/unittests/UnwindAssembly/CMakeLists.txt index 136fcd9ae9798..d6e4471af4ecb 100644 --- a/lldb/unittests/UnwindAssembly/CMakeLists.txt +++ b/lldb/unittests/UnwindAssembly/CMakeLists.txt @@ -9,3 +9,7 @@ endif() if ("X86" IN_LIST LLVM_TARGETS_TO_BUILD) add_subdirectory(x86) endif() + +if (NOT "X86" IN_LIST LLVM_TARGETS_TO_BUILD) + add_subdirectory(x86-but-no-x86-target) +endif() diff --git a/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt new file mode 100644 index 0000000000000..d28e9629a64cf --- /dev/null +++ b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/CMakeLists.txt @@ -0,0 +1,10 @@ +add_lldb_unittest(UnwindAssemblyX86ButNoX86TargetTests + Testx86AssemblyInspectionEngine.cpp + LINK_LIBS + lldbCore + lldbSymbol + lldbPluginUnwindAssemblyX86 + LINK_COMPONENTS + Support + ${LLVM_TARGETS_TO_BUILD} + ) diff --git a/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp new file mode 100644 index 0000000000000..ed093d146440e --- /dev/null +++ b/lldb/unittests/UnwindAssembly/x86-but-no-x86-target/Testx86AssemblyInspectionEngine.cpp @@ -0,0 +1,103 @@ +//===-- Testx86AssemblyInspectionEngine.cpp -------------------------------===// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gtest/gtest.h" + +#include "Plugins/UnwindAssembly/x86/x86AssemblyInspectionEngine.h" +#include "lldb/Core/AddressRange.h" +#include "lldb/Symbol/UnwindPlan.h" +#include "lldb/Utility/ArchSpec.h" + +#include "llvm/Support/TargetSelect.h" + +#include +#include + +using namespace lldb; +using namespace lldb_private; + +class Testx86AssemblyInspectionEngine : public testing::Test { +public: + static void SetUpTestCase(); +}; + +void Testx86AssemblyInspectionEngine::SetUpTestCase() { + llvm::InitializeAllTargets(); + llvm::InitializeAllAsmPrinters(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllDisassemblers(); +} + +// only defining the register names / numbers that the unwinder is actually +// using today + +// names should match the constants below. These will be the eRegisterKindLLDB +// register numbers. 
+ +const char *x86_64_reg_names[] = {"rax", "rbx", "rcx", "rdx", "rsp", "rbp", + "rsi", "rdi", "r8", "r9", "r10", "r11", + "r12", "r13", "r14", "r15", "rip"}; + +enum x86_64_regs { + k_rax = 0, + k_rbx = 1, + k_rcx = 2, + k_rdx = 3, + k_rsp = 4, + k_rbp = 5, + k_rsi = 6, + k_rdi = 7, + k_r8 = 8, + k_r9 = 9, + k_r10 = 10, + k_r11 = 11, + k_r12 = 12, + k_r13 = 13, + k_r14 = 14, + k_r15 = 15, + k_rip = 16 +}; + +std::unique_ptr Getx86_64Inspector() { + + ArchSpec arch("x86_64-apple-macosx"); + std::unique_ptr engine( + new x86AssemblyInspectionEngine(arch)); + + std::vector lldb_regnums; + int i = 0; + for (const auto &name : x86_64_reg_names) { + x86AssemblyInspectionEngine::lldb_reg_info ri; + ri.name = name; + ri.lldb_regnum = i++; + lldb_regnums.push_back(ri); + } + + engine->Initialize(lldb_regnums); + return engine; +} + +TEST_F(Testx86AssemblyInspectionEngine, TestSimple64bitFrameFunction) { + std::unique_ptr engine = Getx86_64Inspector(); + + // 'int main() { }' compiled for x86_64-apple-macosx with clang + uint8_t data[] = { + 0x55, // offset 0 -- pushq %rbp + 0x48, 0x89, 0xe5, // offset 1 -- movq %rsp, %rbp + 0x31, 0xc0, // offset 4 -- xorl %eax, %eax + 0x5d, // offset 6 -- popq %rbp + 0xc3 // offset 7 -- retq + }; + + AddressRange sample_range(0x1000, sizeof(data)); + + UnwindPlan unwind_plan(eRegisterKindLLDB); + EXPECT_FALSE(engine->GetNonCallSiteUnwindPlanFromAssembly( + data, sizeof(data), sample_range, unwind_plan)); +} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 22c1d1f186ea5..7da5d8e41f6f8 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -16024,7 +16024,7 @@ Additional Documentation .. [CLANG-ATTR] `Attributes in Clang `__ .. [DWARF] `DWARF Debugging Information Format `__ .. [ELF] `Executable and Linkable Format (ELF) `__ -.. [HRF] `Heterogeneous-race-free Memory Models `__ +.. [HRF] `Heterogeneous-race-free Memory Models `__ .. [HSA] `Heterogeneous System Architecture (HSA) Foundation `__ .. [MsgPack] `Message Pack `__ .. [OpenCL] `The OpenCL Specification Version 2.0 `__ diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 6f5eba263def4..a4cf17a8398a8 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -119,6 +119,7 @@ on support follow. ``Za128rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Za64rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zacas`` Supported (`See note <#riscv-zacas-note>`__) + ``Zama16b`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zawrs`` Assembly Support ``Zba`` Supported ``Zbb`` Supported @@ -237,7 +238,7 @@ Supported .. _riscv-profiles-extensions-note: -``Za128rs``, ``Za64rs``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` +``Za128rs``, ``Za64rs``, ``Zama16b``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse``, ``Shcounterenvw``, ``Shgatpa``, ``Shtvala``, ``Shvsatpa``, ``Shvstvala``, ``Shvstvecd``, ``Ssccptr``, ``Sscounterenw``, ``Ssstateen``, ``Ssstrict``, ``Sstvala``, ``Sstvecd``, ``Ssu64xl``, ``Svade``, ``Svbare`` These extensions are defined as part of the `RISC-V Profiles specification `__. They do not introduce any new features themselves, but instead describe existing hardware features. .. 
_riscv-zacas-note: diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 11add9274a7a0..76ef6ceb94078 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -149,6 +149,9 @@ Changes to the C API * Deprecated ``LLVMConstNUWNeg`` and ``LLVMBuildNUWNeg``. +* Added ``LLVMAtomicRMWBinOpUIncWrap`` and ``LLVMAtomicRMWBinOpUDecWrap`` to + ``LLVMAtomicRMWBinOp`` enum for AtomicRMW instructions. + Changes to the CodeGen infrastructure ------------------------------------- diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 6be5957ce6103..0b03f3b36fcdd 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -361,35 +361,39 @@ typedef enum { } LLVMAtomicOrdering; typedef enum { - LLVMAtomicRMWBinOpXchg, /**< Set the new value and return the one old */ - LLVMAtomicRMWBinOpAdd, /**< Add a value and return the old one */ - LLVMAtomicRMWBinOpSub, /**< Subtract a value and return the old one */ - LLVMAtomicRMWBinOpAnd, /**< And a value and return the old one */ - LLVMAtomicRMWBinOpNand, /**< Not-And a value and return the old one */ - LLVMAtomicRMWBinOpOr, /**< OR a value and return the old one */ - LLVMAtomicRMWBinOpXor, /**< Xor a value and return the old one */ - LLVMAtomicRMWBinOpMax, /**< Sets the value if it's greater than the - original using a signed comparison and return - the old one */ - LLVMAtomicRMWBinOpMin, /**< Sets the value if it's Smaller than the - original using a signed comparison and return - the old one */ - LLVMAtomicRMWBinOpUMax, /**< Sets the value if it's greater than the - original using an unsigned comparison and return - the old one */ - LLVMAtomicRMWBinOpUMin, /**< Sets the value if it's greater than the - original using an unsigned comparison and return - the old one */ - LLVMAtomicRMWBinOpFAdd, /**< Add a floating point value and return the - old one */ - LLVMAtomicRMWBinOpFSub, /**< Subtract a floating point value and return the + LLVMAtomicRMWBinOpXchg, /**< Set the new value and return the one old */ + LLVMAtomicRMWBinOpAdd, /**< Add a value and return the old one */ + LLVMAtomicRMWBinOpSub, /**< Subtract a value and return the old one */ + LLVMAtomicRMWBinOpAnd, /**< And a value and return the old one */ + LLVMAtomicRMWBinOpNand, /**< Not-And a value and return the old one */ + LLVMAtomicRMWBinOpOr, /**< OR a value and return the old one */ + LLVMAtomicRMWBinOpXor, /**< Xor a value and return the old one */ + LLVMAtomicRMWBinOpMax, /**< Sets the value if it's greater than the + original using a signed comparison and return + the old one */ + LLVMAtomicRMWBinOpMin, /**< Sets the value if it's Smaller than the + original using a signed comparison and return + the old one */ + LLVMAtomicRMWBinOpUMax, /**< Sets the value if it's greater than the + original using an unsigned comparison and return + the old one */ + LLVMAtomicRMWBinOpUMin, /**< Sets the value if it's greater than the + original using an unsigned comparison and return + the old one */ + LLVMAtomicRMWBinOpFAdd, /**< Add a floating point value and return the old one */ - LLVMAtomicRMWBinOpFMax, /**< Sets the value if it's greater than the - original using an floating point comparison and - return the old one */ - LLVMAtomicRMWBinOpFMin, /**< Sets the value if it's smaller than the - original using an floating point comparison and - return the old one */ + LLVMAtomicRMWBinOpFSub, /**< Subtract a floating point value and return the + old one */ + LLVMAtomicRMWBinOpFMax, /**< Sets the value if it's greater than the + original using 
an floating point comparison and + return the old one */ + LLVMAtomicRMWBinOpFMin, /**< Sets the value if it's smaller than the + original using an floating point comparison and + return the old one */ + LLVMAtomicRMWBinOpUIncWrap, /**< Increments the value, wrapping back to zero + when incremented above input value */ + LLVMAtomicRMWBinOpUDecWrap, /**< Decrements the value, wrapping back to + the input value when decremented below zero */ } LLVMAtomicRMWBinOp; typedef enum { diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 9db0894162afc..e1c41b3b55ccf 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -124,7 +124,7 @@ bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI); /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. -bool isKnownNonZero(const Value *V, unsigned Depth, const SimplifyQuery &Q); +bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth = 0); /// Return true if the two given values are negation. /// Currently can recoginze Value pair: diff --git a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h index 807cec3c177d9..c4174cee5e10c 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/Utils.h @@ -555,5 +555,9 @@ void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, /// debug users of \p MI by writing the effect of \p MI in a DIExpression. void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI); +/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, +/// having only floating-point operands. +bool isPreISelGenericFloatingPointOpcode(unsigned Opc); + } // End namespace llvm. #endif diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h index 1d40b1cbb0eaa..9574a6f0c7c00 100644 --- a/llvm/include/llvm/CodeGen/LivePhysRegs.h +++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -39,6 +39,8 @@ namespace llvm { +template class ArrayRef; + class MachineInstr; class MachineFunction; class MachineOperand; @@ -207,6 +209,22 @@ static inline bool recomputeLiveIns(MachineBasicBlock &MBB) { return oldLiveIns != newLiveIns; } +/// Convenience function for recomputing live-in's for a set of MBBs until the +/// computation converges. 
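The helper declared just below is a fixed-point loop: recomputeLiveIns reports whether a block's live-in set changed, so the wrapper keeps sweeping the whole array until one full pass reports no change. The generic shape, sketched with illustrative names:

    #include <vector>

    // Re-run 'step' over every item until a complete pass changes nothing.
    template <class T, class Step>
    void iterateToFixedPoint(std::vector<T> &items, Step step) {
      bool changed = true;
      while (changed) {
        changed = false;
        for (T &item : items)
          changed |= step(item); // step returns true if 'item' was updated
      }
    }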
+inline void fullyRecomputeLiveIns(ArrayRef MBBs) { + MachineBasicBlock *const *Data = MBBs.data(); + const size_t Len = MBBs.size(); + while (true) { + bool AnyChange = false; + for (size_t I = 0; I < Len; ++I) + if (recomputeLiveIns(*Data[I])) + AnyChange = true; + if (!AnyChange) + return; + } +} + + } // end namespace llvm #endif // LLVM_CODEGEN_LIVEPHYSREGS_H diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index f381273c46cfb..b6534a1962a2f 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -2004,8 +2004,18 @@ class IRBuilderBase { // Instruction creation methods: Cast/Conversion Operators //===--------------------------------------------------------------------===// - Value *CreateTrunc(Value *V, Type *DestTy, const Twine &Name = "") { - return CreateCast(Instruction::Trunc, V, DestTy, Name); + Value *CreateTrunc(Value *V, Type *DestTy, const Twine &Name = "", + bool IsNUW = false, bool IsNSW = false) { + if (V->getType() == DestTy) + return V; + if (Value *Folded = Folder.FoldCast(Instruction::Trunc, V, DestTy)) + return Folded; + Instruction *I = CastInst::Create(Instruction::Trunc, V, DestTy); + if (IsNUW) + I->setHasNoUnsignedWrap(); + if (IsNSW) + I->setHasNoSignedWrap(); + return Insert(I, Name); } Value *CreateZExt(Value *V, Type *DestTy, const Twine &Name = "", diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index cfe1b11ade5a4..8e6bef69218c2 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -24,6 +24,7 @@ #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/FMF.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/LLVMContext.h" @@ -311,6 +312,32 @@ class BinaryOperator : public Instruction { return BO; } + static BinaryOperator *CreateWithFMF(BinaryOps Opc, Value *V1, Value *V2, + FastMathFlags FMF, + const Twine &Name = "", + Instruction *InsertBefore = nullptr) { + BinaryOperator *BO = Create(Opc, V1, V2, Name, InsertBefore); + BO->setFastMathFlags(FMF); + return BO; + } + + static BinaryOperator *CreateFAddFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FAdd, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFSubFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FSub, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFMulFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FMul, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFDivFMF(Value *V1, Value *V2, FastMathFlags FMF, + const Twine &Name = "") { + return CreateWithFMF(Instruction::FDiv, V1, V2, FMF, Name); + } + static BinaryOperator *CreateFAddFMF(Value *V1, Value *V2, Instruction *FMFSource, const Twine &Name = "") { diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 0dfe9f029f9b1..92eae344ce729 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -105,6 +105,10 @@ namespace Intrinsic { /// Map a MS builtin name to an intrinsic ID. ID getIntrinsicForMSBuiltin(const char *Prefix, StringRef BuiltinName); + /// Returns true if the intrinsic ID is for one of the "Constrained + /// Floating-Point Intrinsics". 
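Stepping back to the IRBuilder change above before the declaration that comment introduces: the widened CreateTrunc overload lets callers attach the wrap flags at creation time instead of mutating the instruction afterwards. A usage sketch (the helper name is mine):

    #include "llvm/IR/IRBuilder.h"

    // Emit "trunc nuw nsw %v to i8": both flag arguments pass straight
    // through the new CreateTrunc parameters.
    llvm::Value *truncToI8(llvm::IRBuilderBase &builder, llvm::Value *v) {
      return builder.CreateTrunc(v, builder.getInt8Ty(), "t",
                                 /*IsNUW=*/true, /*IsNSW=*/true);
    }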
+ bool isConstrainedFPIntrinsic(ID QID); + /// This is a type descriptor which explains the type requirements of an /// intrinsic. This is returned by getIntrinsicInfoTableEntries. struct IITDescriptor { diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h index 92cb79d54afc2..98cc0e5037698 100644 --- a/llvm/include/llvm/IR/PatternMatch.h +++ b/llvm/include/llvm/IR/PatternMatch.h @@ -345,7 +345,7 @@ template inline constantint_match m_ConstantInt() { /// This helper class is used to match constant scalars, vector splats, /// and fixed width vectors that satisfy a specified predicate. -/// For fixed width vector constants, undefined elements are ignored. +/// For fixed width vector constants, poison elements are ignored. template struct cstval_pred_ty : public Predicate { template bool match(ITy *V) { @@ -364,19 +364,19 @@ struct cstval_pred_ty : public Predicate { // Non-splat vector constant: check each element for a match. unsigned NumElts = FVTy->getNumElements(); assert(NumElts != 0 && "Constant vector with no elements?"); - bool HasNonUndefElements = false; + bool HasNonPoisonElements = false; for (unsigned i = 0; i != NumElts; ++i) { Constant *Elt = C->getAggregateElement(i); if (!Elt) return false; - if (isa(Elt)) + if (isa(Elt)) continue; auto *CV = dyn_cast(Elt); if (!CV || !this->isValue(CV->getValue())) return false; - HasNonUndefElements = true; + HasNonPoisonElements = true; } - return HasNonUndefElements; + return HasNonPoisonElements; } } return false; @@ -2587,31 +2587,6 @@ m_Not(const ValTy &V) { return m_c_Xor(m_AllOnes(), V); } -template struct NotForbidUndef_match { - ValTy Val; - NotForbidUndef_match(const ValTy &V) : Val(V) {} - - template bool match(OpTy *V) { - // We do not use m_c_Xor because that could match an arbitrary APInt that is - // not -1 as C and then fail to match the other operand if it is -1. - // This code should still work even when both operands are constants. - Value *X; - const APInt *C; - if (m_Xor(m_Value(X), m_APIntForbidUndef(C)).match(V) && C->isAllOnes()) - return Val.match(X); - if (m_Xor(m_APIntForbidUndef(C), m_Value(X)).match(V) && C->isAllOnes()) - return Val.match(X); - return false; - } -}; - -/// Matches a bitwise 'not' as 'xor V, -1' or 'xor -1, V'. For vectors, the -/// constant value must be composed of only -1 scalar elements. -template -inline NotForbidUndef_match m_NotForbidUndef(const ValTy &V) { - return NotForbidUndef_match(V); -} - /// Matches an SMin with LHS and RHS in either order. template inline MaxMin_match diff --git a/llvm/include/llvm/ProfileData/InstrProfReader.h b/llvm/include/llvm/ProfileData/InstrProfReader.h index e46570af3873f..f662febb9216b 100644 --- a/llvm/include/llvm/ProfileData/InstrProfReader.h +++ b/llvm/include/llvm/ProfileData/InstrProfReader.h @@ -508,9 +508,9 @@ class InstrProfLookupTrait { using namespace support; offset_type KeyLen = - endian::readNext(D); + endian::readNext(D); offset_type DataLen = - endian::readNext(D); + endian::readNext(D); return std::make_pair(KeyLen, DataLen); } diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 0431c182276ec..7f3956bd73939 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -16,6 +16,8 @@ namespace llvm { namespace memprof { +struct MemProfRecord; + // The versions of the indexed MemProf format enum IndexedVersion : uint64_t { // Version 0: This version didn't have a version field. 
@@ -66,7 +68,7 @@ struct PortableMemInfoBlock { switch (Id) { #define MIBEntryDef(NameTag, Name, Type) \ case Meta::Name: { \ - Name = endian::readNext(Ptr); \ + Name = endian::readNext(Ptr); \ } break; #include "llvm/ProfileData/MIBEntryDef.inc" #undef MIBEntryDef @@ -221,13 +223,12 @@ struct Frame { using namespace support; const uint64_t F = - endian::readNext(Ptr); + endian::readNext(Ptr); const uint32_t L = - endian::readNext(Ptr); + endian::readNext(Ptr); const uint32_t C = - endian::readNext(Ptr); - const bool I = - endian::readNext(Ptr); + endian::readNext(Ptr); + const bool I = endian::readNext(Ptr); return Frame(/*Function=*/F, /*LineOffset=*/L, /*Column=*/C, /*IsInlineFrame=*/I); } @@ -369,14 +370,9 @@ struct IndexedMemProfRecord { size_t serializedSize(IndexedVersion Version) const; bool operator==(const IndexedMemProfRecord &Other) const { - if (Other.AllocSites.size() != AllocSites.size()) + if (Other.AllocSites != AllocSites) return false; - for (size_t I = 0; I < AllocSites.size(); I++) { - if (AllocSites[I] != Other.AllocSites[I]) - return false; - } - if (Other.CallSiteIds != CallSiteIds) return false; return true; @@ -392,6 +388,12 @@ struct IndexedMemProfRecord { const unsigned char *Buffer, IndexedVersion Version); + // Convert IndexedMemProfRecord to MemProfRecord. Callback is used to + // translate CallStackId to call stacks with frames inline. + MemProfRecord toMemProfRecord( + std::function(const CallStackId)> Callback) + const; + // Returns the GUID for the function name after canonicalization. For // memprof, we remove any .llvm suffix added by LTO. MemProfRecords are // mapped to functions using this GUID. @@ -474,16 +476,15 @@ class RecordLookupTrait { using namespace support; offset_type KeyLen = - endian::readNext(D); + endian::readNext(D); offset_type DataLen = - endian::readNext(D); + endian::readNext(D); return std::make_pair(KeyLen, DataLen); } uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { using namespace support; - return endian::readNext(D); + return endian::readNext(D); } data_type ReadData(uint64_t K, const unsigned char *D, @@ -615,16 +616,15 @@ class FrameLookupTrait { using namespace support; offset_type KeyLen = - endian::readNext(D); + endian::readNext(D); offset_type DataLen = - endian::readNext(D); + endian::readNext(D); return std::make_pair(KeyLen, DataLen); } uint64_t ReadKey(const unsigned char *D, offset_type /*Unused*/) { using namespace support; - return endian::readNext(D); + return endian::readNext(D); } data_type ReadData(uint64_t K, const unsigned char *D, diff --git a/llvm/include/llvm/ProfileData/MemProfReader.h b/llvm/include/llvm/ProfileData/MemProfReader.h index 89f49a20a6089..7fa8af184dc93 100644 --- a/llvm/include/llvm/ProfileData/MemProfReader.h +++ b/llvm/include/llvm/ProfileData/MemProfReader.h @@ -70,8 +70,20 @@ class MemProfReader { Callback = std::bind(&MemProfReader::idToFrame, this, std::placeholders::_1); + auto CallStackCallback = [&](CallStackId CSId) { + llvm::SmallVector CallStack; + auto Iter = CSIdToCallStack.find(CSId); + assert(Iter != CSIdToCallStack.end()); + for (FrameId Id : Iter->second) + CallStack.push_back(Callback(Id)); + return CallStack; + }; + const IndexedMemProfRecord &IndexedRecord = Iter->second; - GuidRecord = {Iter->first, MemProfRecord(IndexedRecord, Callback)}; + GuidRecord = { + Iter->first, + IndexedRecord.toMemProfRecord(CallStackCallback), + }; Iter++; return Error::success(); } @@ -84,8 +96,15 @@ class MemProfReader { // Initialize the MemProfReader with the frame 
mappings and profile contents. MemProfReader( llvm::DenseMap FrameIdMap, + llvm::MapVector ProfData); + + // Initialize the MemProfReader with the frame mappings, call stack mappings, + // and profile contents. + MemProfReader( + llvm::DenseMap FrameIdMap, + llvm::DenseMap> CSIdMap, llvm::MapVector ProfData) - : IdToFrame(std::move(FrameIdMap)), + : IdToFrame(std::move(FrameIdMap)), CSIdToCallStack(std::move(CSIdMap)), FunctionProfileData(std::move(ProfData)) {} protected: @@ -97,6 +116,8 @@ class MemProfReader { } // A mapping from FrameId (a hash of the contents) to the frame. llvm::DenseMap IdToFrame; + // A mapping from CallStackId to the call stack. + llvm::DenseMap> CSIdToCallStack; // A mapping from function GUID, hash of the canonical function symbol to the // memprof profile data for that function, i.e allocation and callsite info. llvm::MapVector FunctionProfileData; diff --git a/llvm/include/llvm/Support/Endian.h b/llvm/include/llvm/Support/Endian.h index 1cdb5ca0d5eaa..30e0852b972c5 100644 --- a/llvm/include/llvm/Support/Endian.h +++ b/llvm/include/llvm/Support/Endian.h @@ -80,8 +80,8 @@ template return ret; } -template +template [[nodiscard]] inline value_type readNext(const CharT *&memory) { return readNext(memory, endian); } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index 0a8cbbd8b1883..f6b4055e74de7 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -368,14 +368,12 @@ template class OnDiskChainedHashTable { // 'Items' starts with a 16-bit unsigned integer representing the // number of items in this bucket. - unsigned Len = - endian::readNext(Items); + unsigned Len = endian::readNext(Items); for (unsigned i = 0; i < Len; ++i) { // Read the hash. hash_value_type ItemHash = - endian::readNext(Items); + endian::readNext(Items); // Determine the length of the key and the data. const std::pair &L = @@ -473,8 +471,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable { // 'Items' starts with a 16-bit unsigned integer representing the // number of items in this bucket. NumItemsInBucketLeft = - endian::readNext( - Ptr); + endian::readNext(Ptr); } Ptr += sizeof(hash_value_type); // Skip the hash. // Determine the length of the key and the data. diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 5765926d6d93d..cb98f96af522f 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -351,6 +351,9 @@ HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUND) /// INTRINSIC round to integer intrinsic. HANDLE_TARGET_OPCODE(G_INTRINSIC_LRINT) +/// INTRINSIC long round to integer intrinsic. +HANDLE_TARGET_OPCODE(G_INTRINSIC_LLRINT) + /// INTRINSIC roundeven intrinsic. 
HANDLE_TARGET_OPCODE(G_INTRINSIC_ROUNDEVEN) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index d0f471eb29b6f..e8cf8fcb647f4 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1089,6 +1089,12 @@ def G_INTRINSIC_LRINT : GenericInstruction { let hasSideEffects = false; } +def G_INTRINSIC_LLRINT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type1:$src); + let hasSideEffects = false; +} + def G_INTRINSIC_ROUNDEVEN : GenericInstruction { let OutOperandList = (outs type0:$dst); let InOperandList = (ins type0:$src1); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 778ff7e437eb5..8568a7ae90e56 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -443,14 +443,20 @@ def select_constant_cmp: GICombineRule< // TODO: handle compares (currently not marked as isCommutable) def commute_int_constant_to_rhs : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR):$root, + (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR, + G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_UADDO, G_SADDO, + G_UMULO, G_SMULO, G_UMULH, G_SMULH, + G_UADDSAT, G_SADDSAT, G_SMULFIX, G_UMULFIX, + G_SMULFIXSAT, G_UMULFIXSAT):$root, [{ return Helper.matchCommuteConstantToRHS(*${root}); }]), (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }]) >; def commute_fp_constant_to_rhs : GICombineRule< (defs root:$root), - (match (wip_match_opcode G_FADD, G_FMUL):$root, + (match (wip_match_opcode G_FADD, G_FMUL, G_FMINNUM, G_FMAXNUM, + G_FMINNUM_IEEE, G_FMAXNUM_IEEE, + G_FMINIMUM, G_FMAXIMUM):$root, [{ return Helper.matchCommuteFPConstantToRHS(*${root}); }]), (apply [{ Helper.applyCommuteBinOpOperands(*${root}); }]) >; diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index dd4e7d790bc6b..8fa0e4b86d6dc 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -157,6 +157,7 @@ def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; +def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; def : GINodeEquiv; diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 805b963a7a13c..bcb36beb3cc0b 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -46,13 +46,10 @@ enum CPUFeatures { FEAT_FP, FEAT_SIMD, FEAT_CRC, - FEAT_SHA1, FEAT_SHA2, FEAT_SHA3, FEAT_AES, - FEAT_PMULL, FEAT_FP16, - FEAT_DIT, FEAT_DPB, FEAT_DPB2, FEAT_JSCVT, @@ -60,35 +57,23 @@ enum CPUFeatures { FEAT_RCPC, FEAT_RCPC2, FEAT_FRINTTS, - FEAT_DGH, FEAT_I8MM, FEAT_BF16, - FEAT_EBF16, FEAT_RPRES, FEAT_SVE, - FEAT_SVE_BF16, - FEAT_SVE_EBF16, - FEAT_SVE_I8MM, FEAT_SVE_F32MM, FEAT_SVE_F64MM, FEAT_SVE2, FEAT_SVE_AES, - FEAT_SVE_PMULL128, FEAT_SVE_BITPERM, FEAT_SVE_SHA3, FEAT_SVE_SM4, FEAT_SME, FEAT_MEMTAG, - FEAT_MEMTAG2, - FEAT_MEMTAG3, FEAT_SB, FEAT_PREDRES, FEAT_SSBS, - FEAT_SSBS2, - FEAT_BTI, FEAT_LS64, - FEAT_LS64_V, - FEAT_LS64_ACCDATA, FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, @@ -96,12 +81,12 @@ enum CPUFeatures { FEAT_RCPC3, FEAT_MOPS, FEAT_MAX, - FEAT_EXT = 62, + FEAT_EXT = 47, FEAT_INIT }; -static_assert(FEAT_MAX < 62, - "Number of features 
in CPUFeatures are limited to 62 entries"); +static_assert(FEAT_MAX < 47, + "Number of features in CPUFeatures are limited to 47 entries"); // Arch extension modifiers for CPUs. These are labelled with their Arm ARM // feature name (though the canonical reference for those is AArch64.td) @@ -215,17 +200,13 @@ inline constexpr ExtensionInfo Extensions[] = { {"b16b16", AArch64::AEK_B16B16, "+b16b16", "-b16b16", FEAT_INIT, "", 0}, {"bf16", AArch64::AEK_BF16, "+bf16", "-bf16", FEAT_BF16, "+bf16", 280}, {"brbe", AArch64::AEK_BRBE, "+brbe", "-brbe", FEAT_INIT, "", 0}, - {"bti", AArch64::AEK_NONE, {}, {}, FEAT_BTI, "+bti", 510}, {"crc", AArch64::AEK_CRC, "+crc", "-crc", FEAT_CRC, "+crc", 110}, {"crypto", AArch64::AEK_CRYPTO, "+crypto", "-crypto", FEAT_INIT, "+aes,+sha2", 0}, {"cssc", AArch64::AEK_CSSC, "+cssc", "-cssc", FEAT_INIT, "", 0}, {"d128", AArch64::AEK_D128, "+d128", "-d128", FEAT_INIT, "", 0}, - {"dgh", AArch64::AEK_NONE, {}, {}, FEAT_DGH, "", 260}, - {"dit", AArch64::AEK_NONE, {}, {}, FEAT_DIT, "+dit", 180}, {"dotprod", AArch64::AEK_DOTPROD, "+dotprod", "-dotprod", FEAT_DOTPROD, "+dotprod,+fp-armv8,+neon", 104}, {"dpb", AArch64::AEK_NONE, {}, {}, FEAT_DPB, "+ccpp", 190}, {"dpb2", AArch64::AEK_NONE, {}, {}, FEAT_DPB2, "+ccpp,+ccdp", 200}, - {"ebf16", AArch64::AEK_NONE, {}, {}, FEAT_EBF16, "+bf16", 290}, {"f32mm", AArch64::AEK_F32MM, "+f32mm", "-f32mm", FEAT_SVE_F32MM, "+sve,+f32mm,+fullfp16,+fp-armv8,+neon", 350}, {"f64mm", AArch64::AEK_F64MM, "+f64mm", "-f64mm", FEAT_SVE_F64MM, "+sve,+f64mm,+fullfp16,+fp-armv8,+neon", 360}, {"fcma", AArch64::AEK_FCMA, "+complxnum", "-complxnum", FEAT_FCMA, "+fp-armv8,+neon,+complxnum", 220}, @@ -239,17 +220,12 @@ inline constexpr ExtensionInfo Extensions[] = { {"i8mm", AArch64::AEK_I8MM, "+i8mm", "-i8mm", FEAT_I8MM, "+i8mm", 270}, {"ite", AArch64::AEK_ITE, "+ite", "-ite", FEAT_INIT, "", 0}, {"jscvt", AArch64::AEK_JSCVT, "+jsconv", "-jsconv", FEAT_JSCVT, "+fp-armv8,+neon,+jsconv", 210}, - {"ls64_accdata", AArch64::AEK_NONE, {}, {}, FEAT_LS64_ACCDATA, "+ls64", 540}, - {"ls64_v", AArch64::AEK_NONE, {}, {}, FEAT_LS64_V, "", 530}, {"ls64", AArch64::AEK_LS64, "+ls64", "-ls64", FEAT_LS64, "", 520}, {"lse", AArch64::AEK_LSE, "+lse", "-lse", FEAT_LSE, "+lse", 80}, {"lse128", AArch64::AEK_LSE128, "+lse128", "-lse128", FEAT_INIT, "", 0}, {"memtag", AArch64::AEK_MTE, "+mte", "-mte", FEAT_MEMTAG, "", 440}, - {"memtag2", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG2, "+mte", 450}, - {"memtag3", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG3, "+mte", 460}, {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_MOPS, "+mops", 650}, {"pauth", AArch64::AEK_PAUTH, "+pauth", "-pauth", FEAT_INIT, "", 0}, - {"pmull", AArch64::AEK_NONE, {}, {}, FEAT_PMULL, "+aes,+fp-armv8,+neon", 160}, {"pmuv3", AArch64::AEK_PERFMON, "+perfmon", "-perfmon", FEAT_INIT, "", 0}, {"predres", AArch64::AEK_PREDRES, "+predres", "-predres", FEAT_PREDRES, "+predres", 480}, {"predres2", AArch64::AEK_SPECRES2, "+specres2", "-specres2", FEAT_INIT, "", 0}, @@ -263,7 +239,6 @@ inline constexpr ExtensionInfo Extensions[] = { {"rng", AArch64::AEK_RAND, "+rand", "-rand", FEAT_RNG, "+rand", 10}, {"rpres", AArch64::AEK_NONE, {}, {}, FEAT_RPRES, "", 300}, {"sb", AArch64::AEK_SB, "+sb", "-sb", FEAT_SB, "+sb", 470}, - {"sha1", AArch64::AEK_NONE, {}, {}, FEAT_SHA1, "+fp-armv8,+neon", 120}, {"sha2", AArch64::AEK_SHA2, "+sha2", "-sha2", FEAT_SHA2, "+sha2,+fp-armv8,+neon", 130}, {"sha3", AArch64::AEK_SHA3, "+sha3", "-sha3", FEAT_SHA3, "+sha3,+sha2,+fp-armv8,+neon", 140}, {"simd", AArch64::AEK_SIMD, "+neon", "-neon", FEAT_SIMD, 
"+fp-armv8,+neon", 100}, @@ -275,14 +250,9 @@ inline constexpr ExtensionInfo Extensions[] = { {"sme2", AArch64::AEK_SME2, "+sme2", "-sme2", FEAT_SME2, "+sme2,+sme,+bf16", 580}, {"sme2p1", AArch64::AEK_SME2p1, "+sme2p1", "-sme2p1", FEAT_INIT, "+sme2p1,+sme2,+sme,+bf16", 0}, {"ssbs", AArch64::AEK_SSBS, "+ssbs", "-ssbs", FEAT_SSBS, "", 490}, - {"ssbs2", AArch64::AEK_NONE, {}, {}, FEAT_SSBS2, "+ssbs", 500}, - {"sve-bf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_BF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 320}, - {"sve-ebf16", AArch64::AEK_NONE, {}, {}, FEAT_SVE_EBF16, "+sve,+bf16,+fullfp16,+fp-armv8,+neon", 330}, - {"sve-i8mm", AArch64::AEK_NONE, {}, {}, FEAT_SVE_I8MM, "+sve,+i8mm,+fullfp16,+fp-armv8,+neon", 340}, {"sve", AArch64::AEK_SVE, "+sve", "-sve", FEAT_SVE, "+sve,+fullfp16,+fp-armv8,+neon", 310}, {"sve2-aes", AArch64::AEK_SVE2AES, "+sve2-aes", "-sve2-aes", FEAT_SVE_AES, "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 380}, {"sve2-bitperm", AArch64::AEK_SVE2BITPERM, "+sve2-bitperm", "-sve2-bitperm", FEAT_SVE_BITPERM, "+sve2,+sve,+sve2-bitperm,+fullfp16,+fp-armv8,+neon", 400}, - {"sve2-pmull128", AArch64::AEK_NONE, {}, {}, FEAT_SVE_PMULL128, "+sve2,+sve,+sve2-aes,+fullfp16,+fp-armv8,+neon", 390}, {"sve2-sha3", AArch64::AEK_SVE2SHA3, "+sve2-sha3", "-sve2-sha3", FEAT_SVE_SHA3, "+sve2,+sve,+sve2-sha3,+fullfp16,+fp-armv8,+neon", 410}, {"sve2-sm4", AArch64::AEK_SVE2SM4, "+sve2-sm4", "-sve2-sm4", FEAT_SVE_SM4, "+sve2,+sve,+sve2-sm4,+fullfp16,+fp-armv8,+neon", 420}, {"sve2", AArch64::AEK_SVE2, "+sve2", "-sve2", FEAT_SVE2, "+sve2,+sve,+fullfp16,+fp-armv8,+neon", 370}, diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 345e09dce0b2b..187ace3a0cbed 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -372,6 +372,15 @@ RecurKind getMinMaxReductionRecurKind(Intrinsic::ID RdxID); /// Returns the comparison predicate used when expanding a min/max reduction. CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK); +/// See RecurrenceDescriptor::isAnyOfPattern for a description of the pattern we +/// are trying to match. In this pattern, we are only ever selecting between two +/// values: 1) an initial start value \p StartVal of the reduction PHI, and 2) a +/// loop invariant value. If any of lane value in \p Left, \p Right is not equal +/// to \p StartVal, select the loop invariant value. This is done by selecting +/// \p Right iff \p Left is equal to \p StartVal. +Value *createAnyOfOp(IRBuilderBase &Builder, Value *StartVal, RecurKind RK, + Value *Left, Value *Right); + /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. /// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index b082dfe8fbd21..16ee2ca49d0ec 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -1283,8 +1283,7 @@ AliasResult BasicAAResult::aliasGEP( // VarIndex = Scale*V. const VariableGEPIndex &Var = DecompGEP1.VarIndices[0]; if (Var.Val.TruncBits == 0 && - isKnownNonZero(Var.Val.V, /*Depth=*/0, - SimplifyQuery(DL, DT, &AC, Var.CxtI))) { + isKnownNonZero(Var.Val.V, SimplifyQuery(DL, DT, &AC, Var.CxtI))) { // Check if abs(V*Scale) >= abs(Scale) holds in the presence of // potentially wrapping math. 
auto MultiplyByScaleNoWrap = [](const VariableGEPIndex &Var) { diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 4e6e666922671..06ba5ca4c6b35 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -1513,7 +1513,7 @@ static Value *simplifyAShrInst(Value *Op0, Value *Op1, bool IsExact, // -1 >>a X --> -1 // (-1 << X) a>> X --> -1 - // Do not return Op0 because it may contain undef elements if it's a vector. + // We could return the original -1 constant to preserve poison elements. if (match(Op0, m_AllOnes()) || match(Op0, m_Shl(m_AllOnes(), m_Specific(Op1)))) return Constant::getAllOnesValue(Op0->getType()); @@ -1586,10 +1586,10 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, if (match(UnsignedICmp, m_c_ICmp(UnsignedPred, m_Specific(Y), m_Specific(A)))) { if (UnsignedPred == ICmpInst::ICMP_UGE && IsAnd && - EqPred == ICmpInst::ICMP_NE && isKnownNonZero(B, /*Depth=*/0, Q)) + EqPred == ICmpInst::ICMP_NE && isKnownNonZero(B, Q)) return UnsignedICmp; if (UnsignedPred == ICmpInst::ICMP_ULT && !IsAnd && - EqPred == ICmpInst::ICMP_EQ && isKnownNonZero(B, /*Depth=*/0, Q)) + EqPred == ICmpInst::ICMP_EQ && isKnownNonZero(B, Q)) return UnsignedICmp; } } @@ -1607,13 +1607,13 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, // X > Y && Y == 0 --> Y == 0 iff X != 0 // X > Y || Y == 0 --> X > Y iff X != 0 if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ && - isKnownNonZero(X, /*Depth=*/0, Q)) + isKnownNonZero(X, Q)) return IsAnd ? ZeroICmp : UnsignedICmp; // X <= Y && Y != 0 --> X <= Y iff X != 0 // X <= Y || Y != 0 --> Y != 0 iff X != 0 if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE && - isKnownNonZero(X, /*Depth=*/0, Q)) + isKnownNonZero(X, Q)) return IsAnd ? UnsignedICmp : ZeroICmp; // The transforms below here are expected to be handled more generally with @@ -2281,7 +2281,7 @@ static Value *simplifyOrLogic(Value *X, Value *Y) { // (B ^ ~A) | (A & B) --> B ^ ~A // (~A ^ B) | (B & A) --> ~A ^ B // (B ^ ~A) | (B & A) --> B ^ ~A - if (match(X, m_c_Xor(m_NotForbidUndef(m_Value(A)), m_Value(B))) && + if (match(X, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) && match(Y, m_c_And(m_Specific(A), m_Specific(B)))) return X; @@ -2298,31 +2298,29 @@ static Value *simplifyOrLogic(Value *X, Value *Y) { // (B & ~A) | ~(A | B) --> ~A // (B & ~A) | ~(B | A) --> ~A Value *NotA; - if (match(X, - m_c_And(m_CombineAnd(m_Value(NotA), m_NotForbidUndef(m_Value(A))), - m_Value(B))) && + if (match(X, m_c_And(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))), + m_Value(B))) && match(Y, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))) return NotA; // The same is true of Logical And // TODO: This could share the logic of the version above if there was a // version of LogicalAnd that allowed more than just i1 types. 
- if (match(X, m_c_LogicalAnd( - m_CombineAnd(m_Value(NotA), m_NotForbidUndef(m_Value(A))), - m_Value(B))) && + if (match(X, m_c_LogicalAnd(m_CombineAnd(m_Value(NotA), m_Not(m_Value(A))), + m_Value(B))) && match(Y, m_Not(m_c_LogicalOr(m_Specific(A), m_Specific(B))))) return NotA; // ~(A ^ B) | (A & B) --> ~(A ^ B) // ~(A ^ B) | (B & A) --> ~(A ^ B) Value *NotAB; - if (match(X, m_CombineAnd(m_NotForbidUndef(m_Xor(m_Value(A), m_Value(B))), + if (match(X, m_CombineAnd(m_Not(m_Xor(m_Value(A), m_Value(B))), m_Value(NotAB))) && match(Y, m_c_And(m_Specific(A), m_Specific(B)))) return NotAB; // ~(A & B) | (A ^ B) --> ~(A & B) // ~(A & B) | (B ^ A) --> ~(A & B) - if (match(X, m_CombineAnd(m_NotForbidUndef(m_And(m_Value(A), m_Value(B))), + if (match(X, m_CombineAnd(m_Not(m_And(m_Value(A), m_Value(B))), m_Value(NotAB))) && match(Y, m_c_Xor(m_Specific(A), m_Specific(B)))) return NotAB; @@ -2552,9 +2550,8 @@ static Value *simplifyXorInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, // The 'not' op must contain a complete -1 operand (no undef elements for // vector) for the transform to be safe. Value *NotA; - if (match(X, - m_c_Or(m_CombineAnd(m_NotForbidUndef(m_Value(A)), m_Value(NotA)), - m_Value(B))) && + if (match(X, m_c_Or(m_CombineAnd(m_Not(m_Value(A)), m_Value(NotA)), + m_Value(B))) && match(Y, m_c_And(m_Specific(A), m_Specific(B)))) return NotA; @@ -2817,10 +2814,9 @@ static Constant *computePointerICmp(CmpInst::Predicate Pred, Value *LHS, // the other operand can not be based on the alloc - if it were, then // the cmp itself would be a capture. Value *MI = nullptr; - if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, /*Depth=*/0, Q)) + if (isAllocLikeFn(LHS, TLI) && llvm::isKnownNonZero(RHS, Q)) MI = LHS; - else if (isAllocLikeFn(RHS, TLI) && - llvm::isKnownNonZero(LHS, /*Depth=*/0, Q)) + else if (isAllocLikeFn(RHS, TLI) && llvm::isKnownNonZero(LHS, Q)) MI = RHS; if (MI) { // FIXME: This is incorrect, see PR54002. 
While we can assume that the @@ -2976,12 +2972,12 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, return getTrue(ITy); case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE: - if (isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (isKnownNonZero(LHS, Q)) return getFalse(ITy); break; case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: - if (isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (isKnownNonZero(LHS, Q)) return getTrue(ITy); break; case ICmpInst::ICMP_SLT: { @@ -2996,7 +2992,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, KnownBits LHSKnown = computeKnownBits(LHS, /* Depth */ 0, Q); if (LHSKnown.isNegative()) return getTrue(ITy); - if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q)) return getFalse(ITy); break; } @@ -3012,7 +3008,7 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, KnownBits LHSKnown = computeKnownBits(LHS, /* Depth */ 0, Q); if (LHSKnown.isNegative()) return getFalse(ITy); - if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, /*Depth=*/0, Q)) + if (LHSKnown.isNonNegative() && isKnownNonZero(LHS, Q)) return getTrue(ITy); break; } @@ -3165,7 +3161,7 @@ static Value *simplifyICmpWithBinOpOnLHS(CmpInst::Predicate Pred, const APInt *C; if ((match(LBO, m_LShr(m_Specific(RHS), m_APInt(C))) && *C != 0) || (match(LBO, m_UDiv(m_Specific(RHS), m_APInt(C))) && *C != 1)) { - if (isKnownNonZero(RHS, /*Depth=*/0, Q)) { + if (isKnownNonZero(RHS, Q)) { switch (Pred) { default: break; @@ -3398,7 +3394,7 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, bool NUW = Q.IIQ.hasNoUnsignedWrap(LBO) && Q.IIQ.hasNoUnsignedWrap(RBO); bool NSW = Q.IIQ.hasNoSignedWrap(LBO) && Q.IIQ.hasNoSignedWrap(RBO); if (!NUW || (ICmpInst::isSigned(Pred) && !NSW) || - !isKnownNonZero(LBO->getOperand(0), /*Depth=*/0, Q)) + !isKnownNonZero(LBO->getOperand(0), Q)) break; if (Value *V = simplifyICmpInst(Pred, LBO->getOperand(1), RBO->getOperand(1), Q, MaxRecurse - 1)) diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 3223b0564e6c9..6cded828c25f4 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -645,7 +645,7 @@ LazyValueInfoImpl::solveBlockValueImpl(Value *Val, BasicBlock *BB) { // instruction is placed, even if it could legally be hoisted much higher. // That is unfortunate. 
PointerType *PT = dyn_cast<PointerType>(BBI->getType()); - if (PT && isKnownNonZero(BBI, /*Depth=*/0, DL)) + if (PT && isKnownNonZero(BBI, DL)) return ValueLatticeElement::getNot(ConstantPointerNull::get(PT)); if (BBI->getType()->isIntegerTy()) { @@ -1863,8 +1863,7 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C, Module *M = CxtI->getModule(); const DataLayout &DL = M->getDataLayout(); if (V->getType()->isPointerTy() && C->isNullValue() && - isKnownNonZero(V->stripPointerCastsSameRepresentation(), /*Depth=*/0, - DL)) { + isKnownNonZero(V->stripPointerCastsSameRepresentation(), DL)) { if (Pred == ICmpInst::ICMP_EQ) return LazyValueInfo::False; else if (Pred == ICmpInst::ICMP_NE) diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 0694c2995dfcc..1ab856ac8830a 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -350,10 +350,7 @@ void Lint::visitCallBase(CallBase &I) { } case Intrinsic::vastart: - Check(I.getParent()->getParent()->isVarArg(), - "Undefined behavior: va_start called in a non-varargs function", - &I); - + // va_start in a non-varargs function is rejected by the verifier. visitMemoryReference(I, MemoryLocation::getForArgument(&I, 0, TLI), std::nullopt, nullptr, MemRef::Read | MemRef::Write); break; diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index b5403408cf2ab..ac508e19c9e01 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -100,7 +100,7 @@ static bool isDereferenceableAndAlignedPointer( if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size) && !CheckForFreed) if (!CheckForNonNull || - isKnownNonZero(V, /*Depth=*/0, SimplifyQuery(DL, DT, AC, CtxI))) { + isKnownNonZero(V, SimplifyQuery(DL, DT, AC, CtxI))) { // As we recursed through GEPs to get here, we've incrementally checked // that each step advanced by a multiple of the alignment. If our base is // properly aligned, then the original offset accessed must also be. @@ -134,7 +134,7 @@ static bool isDereferenceableAndAlignedPointer( if (getObjectSize(V, ObjSize, DL, TLI, Opts)) { APInt KnownDerefBytes(Size.getBitWidth(), ObjSize); if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size) && - isKnownNonZero(V, /*Depth=*/0, SimplifyQuery(DL, DT, AC, CtxI)) && + isKnownNonZero(V, SimplifyQuery(DL, DT, AC, CtxI)) && !V->canBeFreed()) { // As we recursed through GEPs to get here, we've incrementally // checked that each step advanced by a multiple of the alignment.
If diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 1c98b0295e525..95440dda3b4c0 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6900,7 +6900,7 @@ const ConstantRange &ScalarEvolution::getRangeRef( uint64_t Rem = MaxVal.urem(Align); MaxVal -= APInt(BitWidth, Rem); APInt MinVal = APInt::getZero(BitWidth); - if (llvm::isKnownNonZero(V, /*Depth=*/0, DL)) + if (llvm::isKnownNonZero(V, DL)) MinVal = Align; ConservativeResult = ConservativeResult.intersectWith( ConstantRange::getNonEmpty(MinVal, MaxVal + 1), RangeType); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index b3abf016cfb93..ab2f43e1033fa 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -272,7 +272,7 @@ bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, } static bool isKnownNonZero(const Value *V, const APInt &DemandedElts, - unsigned Depth, const SimplifyQuery &Q); + const SimplifyQuery &Q, unsigned Depth); bool llvm::isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth) { @@ -288,7 +288,7 @@ bool llvm::isKnownPositive(const Value *V, const SimplifyQuery &SQ, // this updated. KnownBits Known = computeKnownBits(V, Depth, SQ); return Known.isNonNegative() && - (Known.isNonZero() || isKnownNonZero(V, Depth, SQ)); + (Known.isNonZero() || isKnownNonZero(V, SQ, Depth)); } bool llvm::isKnownNegative(const Value *V, const SimplifyQuery &SQ, @@ -868,7 +868,7 @@ static void computeKnownBitsFromShiftOperator( bool ShAmtNonZero = Known.isNonZero() || (Known.getMaxValue().ult(Known.getBitWidth()) && - isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q)); + isKnownNonZero(I->getOperand(1), DemandedElts, Q, Depth + 1)); Known = KF(Known2, Known, ShAmtNonZero); } @@ -2124,7 +2124,7 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, case Instruction::Mul: return isKnownToBeAPowerOfTwo(I->getOperand(1), OrZero, Depth, Q) && isKnownToBeAPowerOfTwo(I->getOperand(0), OrZero, Depth, Q) && - (OrZero || isKnownNonZero(I, Depth, Q)); + (OrZero || isKnownNonZero(I, Q, Depth)); case Instruction::And: // A power of two and'd with anything is a power of two or zero. if (OrZero && @@ -2134,7 +2134,7 @@ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, // X & (-X) is always a power of two or zero. if (match(I->getOperand(0), m_Neg(m_Specific(I->getOperand(1)))) || match(I->getOperand(1), m_Neg(m_Specific(I->getOperand(0))))) - return OrZero || isKnownNonZero(I->getOperand(0), Depth, Q); + return OrZero || isKnownNonZero(I->getOperand(0), Q, Depth); return false; case Instruction::Add: { // Adding a power-of-two or zero to the same power-of-two or zero yields @@ -2249,7 +2249,7 @@ static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, // If the base pointer is non-null, we cannot walk to a null address with an // inbounds GEP in address space zero. - if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q)) + if (isKnownNonZero(GEP->getPointerOperand(), Q, Depth)) return true; // Walk the GEP operands and see if any operand introduces a non-zero offset. 
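The ValueTracking.cpp hunks here and below are mechanical: every internal call site swaps `Depth` and `Q` to match the new `isKnownNonZero(V, Q, Depth)` order, and top-level callers drop the now-defaulted `/*Depth=*/0`. A small sketch of the resulting convention; the helper and its name are illustrative, not part of the patch:

```cpp
// Sketch of the new calling convention: SimplifyQuery first, Depth last
// (defaulted to 0), with recursive calls bumping Depth explicitly.
#include "llvm/Analysis/SimplifyQuery.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Hypothetical helper: report whether every index of a GEP is non-zero.
static bool allIndicesKnownNonZero(const GetElementPtrInst *GEP,
                                   const SimplifyQuery &Q, unsigned Depth) {
  for (const Value *Idx : GEP->indices())
    if (!isKnownNonZero(Idx, Q, Depth + 1))
      return false;
  return true;
}

// Top-level callers can now omit the depth argument entirely, e.g.
//   isKnownNonZero(V, SimplifyQuery(DL, DT, &AC, CtxI))
```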
@@ -2288,7 +2288,7 @@ static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth, if (Depth++ >= MaxAnalysisRecursionDepth) continue; - if (isKnownNonZero(GTI.getOperand(), Depth, Q)) + if (isKnownNonZero(GTI.getOperand(), Q, Depth)) return true; } @@ -2441,8 +2441,8 @@ static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth, const SimplifyQuery &Q, unsigned BitWidth, Value *X, Value *Y, bool NSW, bool NUW) { if (NUW) - return isKnownNonZero(Y, DemandedElts, Depth, Q) || - isKnownNonZero(X, DemandedElts, Depth, Q); + return isKnownNonZero(Y, DemandedElts, Q, Depth) || + isKnownNonZero(X, DemandedElts, Q, Depth); KnownBits XKnown = computeKnownBits(X, DemandedElts, Depth, Q); KnownBits YKnown = computeKnownBits(Y, DemandedElts, Depth, Q); @@ -2450,8 +2450,8 @@ static bool isNonZeroAdd(const APInt &DemandedElts, unsigned Depth, // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. if (XKnown.isNonNegative() && YKnown.isNonNegative()) - if (isKnownNonZero(Y, DemandedElts, Depth, Q) || - isKnownNonZero(X, DemandedElts, Depth, Q)) + if (isKnownNonZero(Y, DemandedElts, Q, Depth) || + isKnownNonZero(X, DemandedElts, Q, Depth)) return true; // If X and Y are both negative (as signed values) then their sum is not @@ -2485,7 +2485,7 @@ static bool isNonZeroSub(const APInt &DemandedElts, unsigned Depth, Value *Y) { // TODO: Move this case into isKnownNonEqual(). if (auto *C = dyn_cast(X)) - if (C->isNullValue() && isKnownNonZero(Y, DemandedElts, Depth, Q)) + if (C->isNullValue() && isKnownNonZero(Y, DemandedElts, Q, Depth)) return true; return ::isKnownNonEqual(X, Y, Depth, Q); @@ -2497,18 +2497,18 @@ static bool isNonZeroMul(const APInt &DemandedElts, unsigned Depth, // If X and Y are non-zero then so is X * Y as long as the multiplication // does not overflow. if (NSW || NUW) - return isKnownNonZero(X, DemandedElts, Depth, Q) && - isKnownNonZero(Y, DemandedElts, Depth, Q); + return isKnownNonZero(X, DemandedElts, Q, Depth) && + isKnownNonZero(Y, DemandedElts, Q, Depth); // If either X or Y is odd, then if the other is non-zero the result can't // be zero. KnownBits XKnown = computeKnownBits(X, DemandedElts, Depth, Q); if (XKnown.One[0]) - return isKnownNonZero(Y, DemandedElts, Depth, Q); + return isKnownNonZero(Y, DemandedElts, Q, Depth); KnownBits YKnown = computeKnownBits(Y, DemandedElts, Depth, Q); if (YKnown.One[0]) - return XKnown.isNonZero() || isKnownNonZero(X, DemandedElts, Depth, Q); + return XKnown.isNonZero() || isKnownNonZero(X, DemandedElts, Q, Depth); // If there exists any subset of X (sX) and subset of Y (sY) s.t sX * sY is // non-zero, then X * Y is non-zero. We can find sX and sY by just taking @@ -2564,7 +2564,7 @@ static bool isNonZeroShift(const Operator *I, const APInt &DemandedElts, // non-zero then at least one non-zero bit must remain. 
if (InvShiftOp(KnownVal.Zero, NumBits - MaxShift) .eq(InvShiftOp(APInt::getAllOnes(NumBits), NumBits - MaxShift)) && - isKnownNonZero(I->getOperand(0), DemandedElts, Depth, Q)) + isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth)) return true; return false; @@ -2613,7 +2613,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, Type *FromTy = I->getOperand(0)->getType(); if ((FromTy->isIntOrIntVectorTy() || FromTy->isPtrOrPtrVectorTy()) && (BitWidth % getBitWidth(FromTy->getScalarType(), Q.DL)) == 0) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); } break; case Instruction::IntToPtr: // Note that we have to take special care to avoid looking through @@ -2622,7 +2622,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (!isa(I->getType()) && Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <= Q.DL.getTypeSizeInBits(I->getType()).getFixedValue()) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); break; case Instruction::PtrToInt: // Similar to int2ptr above, we can look through ptr2int here if the cast @@ -2630,25 +2630,25 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (!isa(I->getType()) && Q.DL.getTypeSizeInBits(I->getOperand(0)->getType()).getFixedValue() <= Q.DL.getTypeSizeInBits(I->getType()).getFixedValue()) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); break; case Instruction::Sub: return isNonZeroSub(DemandedElts, Depth, Q, BitWidth, I->getOperand(0), I->getOperand(1)); case Instruction::Or: // X | Y != 0 if X != 0 or Y != 0. - return isKnownNonZero(I->getOperand(1), DemandedElts, Depth, Q) || - isKnownNonZero(I->getOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(I->getOperand(1), DemandedElts, Q, Depth) || + isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); case Instruction::SExt: case Instruction::ZExt: // ext X != 0 if X != 0. - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); case Instruction::Shl: { // shl nsw/nuw can't remove any non-zero bits. const OverflowingBinaryOperator *BO = cast(I); if (Q.IIQ.hasNoUnsignedWrap(BO) || Q.IIQ.hasNoSignedWrap(BO)) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); // shl X, Y != 0 if X is odd. Note that the value of the shift is undefined // if the lowest bit is shifted off the end. @@ -2664,7 +2664,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // shr exact can only shift out zero bits. const PossiblyExactOperator *BO = cast(I); if (BO->isExact()) - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); // shr X, Y != 0 if X is negative. Note that the value of the shift is not // defined if the sign bit is shifted off the end. @@ -2680,7 +2680,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // X / Y // div exact can only produce a zero if the dividend is zero. if (cast(I)->isExact()) - return isKnownNonZero(I->getOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(I->getOperand(0), DemandedElts, Q, Depth); std::optional XUgeY; KnownBits XKnown = @@ -2730,7 +2730,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, Value *Op; Op = IsTrueArm ? I->getOperand(1) : I->getOperand(2); // Op is trivially non-zero. 
- if (isKnownNonZero(Op, DemandedElts, Depth, Q)) + if (isKnownNonZero(Op, DemandedElts, Q, Depth)) return true; // The condition of the select dominates the true/false arm. Check if the @@ -2780,7 +2780,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, } } // Finally recurse on the edge and check it directly. - return isKnownNonZero(U.get(), DemandedElts, NewDepth, RecQ); + return isKnownNonZero(U.get(), DemandedElts, RecQ, NewDepth); }); } case Instruction::InsertElement: { @@ -2802,9 +2802,9 @@ static bool isKnownNonZeroFromOperator(const Operator *I, // Result is zero if Elt is non-zero and rest of the demanded elts in Vec // are non-zero. - return (SkipElt || isKnownNonZero(Elt, Depth, Q)) && + return (SkipElt || isKnownNonZero(Elt, Q, Depth)) && (DemandedVecElts.isZero() || - isKnownNonZero(Vec, DemandedVecElts, Depth, Q)); + isKnownNonZero(Vec, DemandedVecElts, Q, Depth)); } case Instruction::ExtractElement: if (const auto *EEI = dyn_cast(I)) { @@ -2816,7 +2816,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, APInt DemandedVecElts = APInt::getAllOnes(NumElts); if (CIdx && CIdx->getValue().ult(NumElts)) DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue()); - return isKnownNonZero(Vec, DemandedVecElts, Depth, Q); + return isKnownNonZero(Vec, DemandedVecElts, Q, Depth); } } break; @@ -2831,12 +2831,12 @@ static bool isKnownNonZeroFromOperator(const Operator *I, break; // If demanded elements for both vecs are non-zero, the shuffle is non-zero. return (DemandedRHS.isZero() || - isKnownNonZero(Shuf->getOperand(1), DemandedRHS, Depth, Q)) && + isKnownNonZero(Shuf->getOperand(1), DemandedRHS, Q, Depth)) && (DemandedLHS.isZero() || - isKnownNonZero(Shuf->getOperand(0), DemandedLHS, Depth, Q)); + isKnownNonZero(Shuf->getOperand(0), DemandedLHS, Q, Depth)); } case Instruction::Freeze: - return isKnownNonZero(I->getOperand(0), Depth, Q) && + return isKnownNonZero(I->getOperand(0), Q, Depth) && isGuaranteedNotToBePoison(I->getOperand(0), Q.AC, Q.CxtI, Q.DT, Depth); case Instruction::Load: { @@ -2886,7 +2886,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, if (Call->isReturnNonNull()) return true; if (const auto *RP = getArgumentAliasingToReturnedPointer(Call, true)) - return isKnownNonZero(RP, Depth, Q); + return isKnownNonZero(RP, Q, Depth); } else { if (MDNode *Ranges = Q.IIQ.getMetadata(Call, LLVMContext::MD_range)) return rangeMetadataExcludesValue(Ranges, APInt::getZero(BitWidth)); @@ -2896,7 +2896,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, return true; } if (const Value *RV = Call->getReturnedArgOperand()) - if (RV->getType() == I->getType() && isKnownNonZero(RV, Depth, Q)) + if (RV->getType() == I->getType() && isKnownNonZero(RV, Q, Depth)) return true; } @@ -2908,7 +2908,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Intrinsic::bitreverse: case Intrinsic::bswap: case Intrinsic::ctpop: - return isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth); // NB: We don't do usub_sat here as in any case we can prove its // non-zero, we will fold it to `sub nuw` in InstCombine. 
case Intrinsic::ssub_sat: @@ -2924,11 +2924,11 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Intrinsic::vector_reduce_umin: case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: - return isKnownNonZero(II->getArgOperand(0), Depth, Q); + return isKnownNonZero(II->getArgOperand(0), Q, Depth); case Intrinsic::umax: case Intrinsic::uadd_sat: - return isKnownNonZero(II->getArgOperand(1), DemandedElts, Depth, Q) || - isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(1), DemandedElts, Q, Depth) || + isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth); case Intrinsic::smax: { // If either arg is strictly positive the result is non-zero. Otherwise // the result is non-zero if both ops are non-zero. @@ -2936,7 +2936,7 @@ static bool isKnownNonZeroFromOperator(const Operator *I, const KnownBits &OpKnown) { if (!OpNonZero.has_value()) OpNonZero = OpKnown.isNonZero() || - isKnownNonZero(Op, DemandedElts, Depth, Q); + isKnownNonZero(Op, DemandedElts, Q, Depth); return *OpNonZero; }; // Avoid re-computing isKnownNonZero. @@ -2971,8 +2971,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, } [[fallthrough]]; case Intrinsic::umin: - return isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q) && - isKnownNonZero(II->getArgOperand(1), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth) && + isKnownNonZero(II->getArgOperand(1), DemandedElts, Q, Depth); case Intrinsic::cttz: return computeKnownBits(II->getArgOperand(0), DemandedElts, Depth, Q) .Zero[0]; @@ -2983,12 +2983,12 @@ static bool isKnownNonZeroFromOperator(const Operator *I, case Intrinsic::fshl: // If Op0 == Op1, this is a rotate. rotate(x, y) != 0 iff x != 0. if (II->getArgOperand(0) == II->getArgOperand(1)) - return isKnownNonZero(II->getArgOperand(0), DemandedElts, Depth, Q); + return isKnownNonZero(II->getArgOperand(0), DemandedElts, Q, Depth); break; case Intrinsic::vscale: return true; case Intrinsic::experimental_get_vector_length: - return isKnownNonZero(I->getOperand(0), Depth, Q); + return isKnownNonZero(I->getOperand(0), Q, Depth); default: break; } @@ -3010,8 +3010,8 @@ static bool isKnownNonZeroFromOperator(const Operator *I, /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. -bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth, - const SimplifyQuery &Q) { +bool isKnownNonZero(const Value *V, const APInt &DemandedElts, + const SimplifyQuery &Q, unsigned Depth) { Type *Ty = V->getType(); #ifndef NDEBUG @@ -3101,12 +3101,12 @@ bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth, return false; } -bool llvm::isKnownNonZero(const Value *V, unsigned Depth, - const SimplifyQuery &Q) { +bool llvm::isKnownNonZero(const Value *V, const SimplifyQuery &Q, + unsigned Depth) { auto *FVTy = dyn_cast(V->getType()); APInt DemandedElts = FVTy ? 
APInt::getAllOnes(FVTy->getNumElements()) : APInt(1, 1); - return ::isKnownNonZero(V, DemandedElts, Depth, Q); + return ::isKnownNonZero(V, DemandedElts, Q, Depth); } /// If the pair of operators are the same invertible function, return the @@ -3253,7 +3253,7 @@ static bool isModifyingBinopOfNonZero(const Value *V1, const Value *V2, Op = BO->getOperand(0); else return false; - return isKnownNonZero(Op, Depth + 1, Q); + return isKnownNonZero(Op, Q, Depth + 1); } return false; } @@ -3266,7 +3266,7 @@ static bool isNonEqualMul(const Value *V1, const Value *V2, unsigned Depth, const APInt *C; return match(OBO, m_Mul(m_Specific(V1), m_APInt(C))) && (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) && - !C->isZero() && !C->isOne() && isKnownNonZero(V1, Depth + 1, Q); + !C->isZero() && !C->isOne() && isKnownNonZero(V1, Q, Depth + 1); } return false; } @@ -3279,7 +3279,7 @@ static bool isNonEqualShl(const Value *V1, const Value *V2, unsigned Depth, const APInt *C; return match(OBO, m_Shl(m_Specific(V1), m_APInt(C))) && (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) && - !C->isZero() && isKnownNonZero(V1, Depth + 1, Q); + !C->isZero() && isKnownNonZero(V1, Q, Depth + 1); } return false; } @@ -5032,6 +5032,19 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts, break; } + case Intrinsic::vector_reduce_fmax: + case Intrinsic::vector_reduce_fmin: + case Intrinsic::vector_reduce_fmaximum: + case Intrinsic::vector_reduce_fminimum: { + // reduce min/max will choose an element from one of the vector elements, + // so we can infer any class information that is common to all elements. + Known = computeKnownFPClass(II->getArgOperand(0), II->getFastMathFlags(), + InterestedClasses, Depth + 1, Q); + // Can only propagate sign if output is never NaN. + if (!Known.isKnownNeverNaN()) + Known.SignBit.reset(); + break; + } case Intrinsic::trunc: case Intrinsic::floor: case Intrinsic::ceil: diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index ecf7bc30913f5..55aa1d438b2a6 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -2047,12 +2047,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) { MBB->splice(Loc, TBB, TBB->begin(), TIB); FBB->erase(FBB->begin(), FIB); - if (UpdateLiveIns) { - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*TBB) || recomputeLiveIns(*FBB); - } while (anyChange); - } + if (UpdateLiveIns) + fullyRecomputeLiveIns({TBB, FBB}); ++NumHoist; return true; diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 22dbb3198a9f1..e657872c38284 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -2314,7 +2314,7 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros, // Bail if the value is never zero. Use &Op = CountZeros->getOperandUse(0); - if (isKnownNonZero(Op, /*Depth=*/0, *DL)) + if (isKnownNonZero(Op, *DL)) return false; // The intrinsic will be sunk behind a compare against zero and branch.
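The vector-reduce cases added to computeKnownFPClass above let callers see through llvm.vector.reduce.fmin and friends: class facts common to all lanes survive the reduction, and the sign bit survives only when NaN is excluded. A hedged sketch of a query built on that, assuming the `SimplifyQuery`-based overload; the helper name is hypothetical:

```cpp
// Sketch: min/max reductions return one of the source lanes, so if no lane
// can be NaN, the reduction result cannot be NaN either.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

static bool reduceIsNeverNaN(const IntrinsicInst *Reduce,
                             const SimplifyQuery &Q) {
  KnownFPClass Known = computeKnownFPClass(Reduce,
                                           /*InterestedClasses=*/fcNan,
                                           /*Depth=*/0, Q);
  return Known.isKnownNeverNaN();
}
```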
diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 0fe4cfefdb160..8e623c85b737b 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -340,6 +340,8 @@ Value *CachingVPExpander::expandPredicationToFPCall( replaceOperation(*NewOp, VPI); return NewOp; } + case Intrinsic::fma: + case Intrinsic::fmuladd: case Intrinsic::experimental_constrained_fma: case Intrinsic::experimental_constrained_fmuladd: { Value *Op0 = VPI.getOperand(0); @@ -347,8 +349,12 @@ Value *CachingVPExpander::expandPredicationToFPCall( Value *Op2 = VPI.getOperand(2); Function *Fn = Intrinsic::getDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = - Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName()); + Value *NewOp; + if (Intrinsic::isConstrainedFPIntrinsic(UnpredicatedIntrinsicID)) + NewOp = + Builder.CreateConstrainedFPCall(Fn, {Op0, Op1, Op2}, VPI.getName()); + else + NewOp = Builder.CreateCall(Fn, {Op0, Op1, Op2}, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -731,6 +737,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { case Intrinsic::vp_minnum: case Intrinsic::vp_maximum: case Intrinsic::vp_minimum: + case Intrinsic::vp_fma: + case Intrinsic::vp_fmuladd: return expandPredicationToFPCall(Builder, VPI, VPI.getFunctionalIntrinsicID().value()); case Intrinsic::vp_load: diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 40c5119ee7fb3..3829c33369b27 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6273,8 +6273,21 @@ bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) { } bool CombinerHelper::matchCommuteConstantToRHS(MachineInstr &MI) { - Register LHS = MI.getOperand(1).getReg(); - Register RHS = MI.getOperand(2).getReg(); + unsigned LHSOpndIdx = 1; + unsigned RHSOpndIdx = 2; + switch (MI.getOpcode()) { + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: + LHSOpndIdx = 2; + RHSOpndIdx = 3; + break; + default: + break; + } + Register LHS = MI.getOperand(LHSOpndIdx).getReg(); + Register RHS = MI.getOperand(RHSOpndIdx).getReg(); if (!getIConstantVRegVal(LHS, MRI)) { // Skip commuting if LHS is not a constant. But, LHS may be a // G_CONSTANT_FOLD_BARRIER. 
If so we commute as long as we don't already @@ -6300,10 +6313,23 @@ bool CombinerHelper::matchCommuteFPConstantToRHS(MachineInstr &MI) { void CombinerHelper::applyCommuteBinOpOperands(MachineInstr &MI) { Observer.changingInstr(MI); - Register LHSReg = MI.getOperand(1).getReg(); - Register RHSReg = MI.getOperand(2).getReg(); - MI.getOperand(1).setReg(RHSReg); - MI.getOperand(2).setReg(LHSReg); + unsigned LHSOpndIdx = 1; + unsigned RHSOpndIdx = 2; + switch (MI.getOpcode()) { + case TargetOpcode::G_UADDO: + case TargetOpcode::G_SADDO: + case TargetOpcode::G_UMULO: + case TargetOpcode::G_SMULO: + LHSOpndIdx = 2; + RHSOpndIdx = 3; + break; + default: + break; + } + Register LHSReg = MI.getOperand(LHSOpndIdx).getReg(); + Register RHSReg = MI.getOperand(RHSOpndIdx).getReg(); + MI.getOperand(LHSOpndIdx).setReg(RHSReg); + MI.getOperand(RHSOpndIdx).setReg(LHSReg); Observer.changedInstr(MI); } diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 9b4575f7f34d4..0b6aae3759756 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1955,6 +1955,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_PTRMASK; case Intrinsic::lrint: return TargetOpcode::G_INTRINSIC_LRINT; + case Intrinsic::llrint: + return TargetOpcode::G_INTRINSIC_LLRINT; // FADD/FMUL require checking the FMF, so are handled elsewhere. case Intrinsic::vector_reduce_fmin: return TargetOpcode::G_VECREDUCE_FMIN; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 156353296cfc1..d55091e2e7173 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -474,6 +474,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(ROUNDEVEN_F); case TargetOpcode::G_INTRINSIC_LRINT: RTLIBCASE(LRINT_F); + case TargetOpcode::G_INTRINSIC_LLRINT: + RTLIBCASE(LLRINT_F); } llvm_unreachable("Unknown libcall function"); } @@ -1061,7 +1063,8 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { return Status; break; } - case TargetOpcode::G_INTRINSIC_LRINT: { + case TargetOpcode::G_INTRINSIC_LRINT: + case TargetOpcode::G_INTRINSIC_LLRINT: { LLT LLTy = MRI.getType(MI.getOperand(1).getReg()); unsigned Size = LLTy.getSizeInBits(); Type *HLTy = getFloatTypeForLLT(Ctx, LLTy); @@ -2661,6 +2664,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: case TargetOpcode::G_INTRINSIC_LRINT: + case TargetOpcode::G_INTRINSIC_LLRINT: case TargetOpcode::G_IS_FPCLASS: Observer.changingInstr(MI); diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index c3bc3203b6360..ae43e9ccf6112 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -1665,3 +1665,47 @@ void llvm::salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI) { } } } + +bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) { + switch (Opc) { + case TargetOpcode::G_FABS: + case TargetOpcode::G_FADD: + case TargetOpcode::G_FCANONICALIZE: + case TargetOpcode::G_FCEIL: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FCOPYSIGN: + case TargetOpcode::G_FCOS: + case TargetOpcode::G_FDIV: + case TargetOpcode::G_FEXP2: + case TargetOpcode::G_FEXP: + case TargetOpcode::G_FFLOOR: + case TargetOpcode::G_FLOG10: 
+ case TargetOpcode::G_FLOG2: + case TargetOpcode::G_FLOG: + case TargetOpcode::G_FMA: + case TargetOpcode::G_FMAD: + case TargetOpcode::G_FMAXIMUM: + case TargetOpcode::G_FMAXNUM: + case TargetOpcode::G_FMAXNUM_IEEE: + case TargetOpcode::G_FMINIMUM: + case TargetOpcode::G_FMINNUM: + case TargetOpcode::G_FMINNUM_IEEE: + case TargetOpcode::G_FMUL: + case TargetOpcode::G_FNEARBYINT: + case TargetOpcode::G_FNEG: + case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FPOW: + case TargetOpcode::G_FPTRUNC: + case TargetOpcode::G_FREM: + case TargetOpcode::G_FRINT: + case TargetOpcode::G_FSIN: + case TargetOpcode::G_FSQRT: + case TargetOpcode::G_FSUB: + case TargetOpcode::G_INTRINSIC_ROUND: + case TargetOpcode::G_INTRINSIC_ROUNDEVEN: + case TargetOpcode::G_INTRINSIC_TRUNC: + return true; + default: + return false; + } +} diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index c264e199cf472..bffdd51bfbca7 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -65,6 +65,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // all the others. Function *DbgValF = M.getFunction("llvm.dbg.value"); DbgValueInst *EarliestDVI = nullptr; + DbgVariableRecord *EarliestDVR = nullptr; DenseMap Line2Var; DIExpression *Expr = nullptr; if (DbgValF) { @@ -80,6 +81,20 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, Expr = DVI->getExpression(); } } + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) { + if (!DVR.isDbgValue()) + continue; + unsigned Line = DVR.getDebugLoc().getLine(); + assert(Line != 0 && "debugify should not insert line 0 locations"); + Line2Var[Line] = DVR.getVariable(); + if (!EarliestDVR || Line < EarliestDVR->getDebugLoc().getLine()) + EarliestDVR = &DVR; + Expr = DVR.getExpression(); + } + } + } if (Line2Var.empty()) return true; @@ -109,7 +124,8 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // Find a suitable local variable for the DBG_VALUE. unsigned Line = MI.getDebugLoc().getLine(); if (!Line2Var.count(Line)) - Line = EarliestDVI->getDebugLoc().getLine(); + Line = EarliestDVI ? EarliestDVI->getDebugLoc().getLine() + : EarliestDVR->getDebugLoc().getLine(); DILocalVariable *LocalVar = Line2Var[Line]; assert(LocalVar && "No variable for current line?"); VarSet.insert(LocalVar); diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp index f86aa3a167202..3fa22447f416b 100644 --- a/llvm/lib/CodeGen/RegisterPressure.cpp +++ b/llvm/lib/CodeGen/RegisterPressure.cpp @@ -64,7 +64,7 @@ static void increaseSetPressure(std::vector &CurrSetPressure, static void decreaseSetPressure(std::vector &CurrSetPressure, const MachineRegisterInfo &MRI, Register Reg, LaneBitmask PrevMask, LaneBitmask NewMask) { - //assert((NewMask & !PrevMask) == 0 && "Must not add bits"); + assert((NewMask & ~PrevMask).none() && "Must not add bits"); if (NewMask.any() || PrevMask.none()) return; @@ -617,17 +617,11 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS, ++I; } } - for (auto *I = Uses.begin(); I != Uses.end();) { - LaneBitmask LiveBefore = getLiveLanesAt(LIS, MRI, true, I->RegUnit, - Pos.getBaseIndex()); - LaneBitmask LaneMask = I->LaneMask & LiveBefore; - if (LaneMask.none()) { - I = Uses.erase(I); - } else { - I->LaneMask = LaneMask; - ++I; - } - } + + // For uses just copy the information from LIS. 
+ for (auto &[RegUnit, LaneMask] : Uses) + LaneMask = getLiveLanesAt(LIS, MRI, true, RegUnit, Pos.getBaseIndex()); + if (AddFlagsMI != nullptr) { for (const RegisterMaskPair &P : DeadDefs) { Register RegUnit = P.RegUnit; @@ -1060,18 +1054,27 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) { // Kill liveness at live defs. for (const RegisterMaskPair &P : RegOpers.Defs) { Register Reg = P.RegUnit; - LaneBitmask LiveLanes = LiveRegs.contains(Reg); + LaneBitmask LiveAfter = LiveRegs.contains(Reg); LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg); LaneBitmask DefLanes = P.LaneMask; - LaneBitmask LiveAfter = (LiveLanes & ~DefLanes) | UseLanes; - decreaseRegPressure(Reg, LiveLanes, LiveAfter); + LaneBitmask LiveBefore = (LiveAfter & ~DefLanes) | UseLanes; + + // There may be parts of the register that were dead before the + // instruction, but became live afterwards. Similarly, some parts + // may have been killed in this instruction. + decreaseRegPressure(Reg, LiveAfter, LiveAfter & LiveBefore); + increaseRegPressure(Reg, LiveAfter, ~LiveAfter & LiveBefore); } // Generate liveness for uses. for (const RegisterMaskPair &P : RegOpers.Uses) { Register Reg = P.RegUnit; - LaneBitmask LiveLanes = LiveRegs.contains(Reg); - LaneBitmask LiveAfter = LiveLanes | P.LaneMask; - increaseRegPressure(Reg, LiveLanes, LiveAfter); + // If this register was also in a def operand, we've handled it + // with defs. + if (getRegLanes(RegOpers.Defs, Reg).any()) + continue; + LaneBitmask LiveAfter = LiveRegs.contains(Reg); + LaneBitmask LiveBefore = LiveAfter | P.LaneMask; + increaseRegPressure(Reg, LiveAfter, LiveBefore); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cbba3a294b3d6..c36b1cc9039c2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24477,11 +24477,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { unsigned InsIdx = V.getConstantOperandVal(2); unsigned NumSubElts = NVT.getVectorMinNumElements(); if (InsIdx <= ExtIdx && (ExtIdx + NumSubElts) <= (InsIdx + NumInsElts) && - TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx)) { - SDLoc DL(N); + TLI.isExtractSubvectorCheap(NVT, InsSubVT, ExtIdx - InsIdx) && + InsSubVT.isFixedLengthVector() && NVT.isFixedLengthVector()) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, InsSub, DAG.getVectorIdxConstant(ExtIdx - InsIdx, DL)); - } } // Try to move vector bitcast after extract_subv by scaling extraction index: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 57a3f6a65e002..7a9cfdf5c3fda 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -1159,8 +1159,14 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { } SDValue Unrolled = DAG.UnrollVectorOp(Node); - for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I) - Results.push_back(Unrolled.getValue(I)); + if (Node->getNumValues() == 1) { + Results.push_back(Unrolled); + } else { + assert(Node->getNumValues() == Unrolled->getNumValues() && + "VectorLegalizer Expand returned wrong number of results!"); + for (unsigned I = 0, E = Unrolled->getNumValues(); I != E; ++I) + Results.push_back(Unrolled.getValue(I)); + } } SDValue VectorLegalizer::ExpandSELECT(SDNode *Node) { diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp index 
5ed67bd0a121e..f5dd21cb92701 100644 --- a/llvm/lib/CodeGen/TailDuplicator.cpp +++ b/llvm/lib/CodeGen/TailDuplicator.cpp @@ -68,6 +68,18 @@ static cl::opt TailDupIndirectBranchSize( "end with indirect branches."), cl::init(20), cl::Hidden); +static cl::opt + TailDupPredSize("tail-dup-pred-size", + cl::desc("Maximum predecessors (maximum successors at the " + "same time) to consider tail duplicating blocks."), + cl::init(16), cl::Hidden); + +static cl::opt + TailDupSuccSize("tail-dup-succ-size", + cl::desc("Maximum successors (maximum predecessors at the " + "same time) to consider tail duplicating blocks."), + cl::init(16), cl::Hidden); + static cl::opt TailDupVerify("tail-dup-verify", cl::desc("Verify sanity of PHI instructions during taildup"), @@ -565,6 +577,14 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple, if (TailBB.isSuccessor(&TailBB)) return false; + // Duplicating a BB which has both multiple predecessors and successors will + // result in a complex CFG and may also cause a huge number of PHI nodes. If we + // want to remove this limitation, we have to address + // https://github.com/llvm/llvm-project/issues/78578. + if (TailBB.pred_size() > TailDupPredSize && + TailBB.succ_size() > TailDupSuccSize) + return false; + // Set the limit on the cost to duplicate. When optimizing for size, // duplicate only one, because one branch instruction can be eliminated to // compensate for the duplication. diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index f64ded4f2cf96..6e7b67ded23c8 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1809,8 +1809,16 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType, else if (attr.hasRetAttr(Attribute::ZExt)) Flags.setZExt(); - for (unsigned i = 0; i < NumParts; ++i) - Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isfixed=*/true, 0, 0)); + for (unsigned i = 0; i < NumParts; ++i) { + ISD::ArgFlagsTy OutFlags = Flags; + if (NumParts > 1 && i == 0) + OutFlags.setSplit(); + else if (i == NumParts - 1 && i != 0) + OutFlags.setSplitEnd(); + + Outs.push_back( + ISD::OutputArg(OutFlags, PartVT, VT, /*isfixed=*/true, 0, 0)); + } } } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 22c9e8cd143c2..ac19ac7932971 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -572,12 +572,12 @@ dwarf::findDebugNamesOffsets(uint64_t EndOfHeaderOffset, Error DWARFDebugNames::NameIndex::extract() { const DWARFDataExtractor &AS = Section.AccelSection; - uint64_t hdrSize = Base; - if (Error E = Hdr.extract(AS, &hdrSize)) + uint64_t EndOfHeaderOffset = Base; + if (Error E = Hdr.extract(AS, &EndOfHeaderOffset)) return E; const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format); - Offsets = dwarf::findDebugNamesOffsets(hdrSize, Hdr); + Offsets = dwarf::findDebugNamesOffsets(EndOfHeaderOffset, Hdr); uint64_t Offset = Offsets.EntryOffsetsBase + (Hdr.NameCount * SectionOffsetSize); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 2c480fb76ee4d..634b2dd5119e8 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5341,10 +5341,11 @@ MDNode *llvm::upgradeInstructionLoopAttachment(MDNode &N) { std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { Triple T(TT); - // The only data layout upgrades needed for pre-GCN are setting the address -
// space of globals to 1. - if (T.isAMDGPU() && !T.isAMDGCN() && !DL.contains("-G") && - !DL.starts_with("G")) { + // The only data layout upgrades needed for pre-GCN, SPIR or SPIRV are setting + // the address space of globals to 1. This does not apply to SPIRV Logical. + if (((T.isAMDGPU() && !T.isAMDGCN()) || + (T.isSPIR() || (T.isSPIRV() && !T.isSPIRVLogical()))) && + !DL.contains("-G") && !DL.starts_with("G")) { return DL.empty() ? std::string("G1") : (DL + "-G1").str(); } diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index a5fb497f54ed1..45b359a94b3ab 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -316,7 +316,7 @@ bool Constant::isElementWiseEqual(Value *Y) const { Constant *C0 = ConstantExpr::getBitCast(const_cast(this), IntTy); Constant *C1 = ConstantExpr::getBitCast(cast(Y), IntTy); Constant *CmpEq = ConstantExpr::getICmp(ICmpInst::ICMP_EQ, C0, C1); - return isa(CmpEq) || match(CmpEq, m_One()); + return isa(CmpEq) || match(CmpEq, m_One()); } static bool diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 8ce9c5ca63bed..6aff94f39d9c0 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -3769,6 +3769,10 @@ static AtomicRMWInst::BinOp mapFromLLVMRMWBinOp(LLVMAtomicRMWBinOp BinOp) { case LLVMAtomicRMWBinOpFSub: return AtomicRMWInst::FSub; case LLVMAtomicRMWBinOpFMax: return AtomicRMWInst::FMax; case LLVMAtomicRMWBinOpFMin: return AtomicRMWInst::FMin; + case LLVMAtomicRMWBinOpUIncWrap: + return AtomicRMWInst::UIncWrap; + case LLVMAtomicRMWBinOpUDecWrap: + return AtomicRMWInst::UDecWrap; } llvm_unreachable("Invalid LLVMAtomicRMWBinOp value!"); @@ -3791,6 +3795,10 @@ static LLVMAtomicRMWBinOp mapToLLVMRMWBinOp(AtomicRMWInst::BinOp BinOp) { case AtomicRMWInst::FSub: return LLVMAtomicRMWBinOpFSub; case AtomicRMWInst::FMax: return LLVMAtomicRMWBinOpFMax; case AtomicRMWInst::FMin: return LLVMAtomicRMWBinOpFMin; + case AtomicRMWInst::UIncWrap: + return LLVMAtomicRMWBinOpUIncWrap; + case AtomicRMWInst::UDecWrap: + return LLVMAtomicRMWBinOpUDecWrap; default: break; } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 96953ac49c19b..818a167560de6 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -499,15 +499,7 @@ static MutableArrayRef makeArgArray(Argument *Args, size_t Count) { } bool Function::isConstrainedFPIntrinsic() const { - switch (getIntrinsicID()) { -#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ - case Intrinsic::INTRINSIC: -#include "llvm/IR/ConstrainedOps.def" - return true; -#undef INSTRUCTION - default: - return false; - } + return Intrinsic::isConstrainedFPIntrinsic(getIntrinsicID()); } void Function::clearArguments() { @@ -1486,6 +1478,18 @@ Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_LLVM_INTRINSIC_FOR_MS_BUILTIN +bool Intrinsic::isConstrainedFPIntrinsic(ID QID) { + switch (QID) { +#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \ + case Intrinsic::INTRINSIC: +#include "llvm/IR/ConstrainedOps.def" + return true; +#undef INSTRUCTION + default: + return false; + } +} + using DeferredIntrinsicMatchPair = std::pair>; diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 516d4a0515569..4cd61e6e531bf 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5798,6 +5798,11 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { break; } + case Intrinsic::vastart: { + Check(Call.getFunction()->isVarArg(), + "va_start called in a 
non-varargs function"); + break; + } case Intrinsic::vector_reduce_and: case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_xor: diff --git a/llvm/lib/MC/MCPseudoProbe.cpp b/llvm/lib/MC/MCPseudoProbe.cpp index eb3894dbb3c25..cec50322bb9f9 100644 --- a/llvm/lib/MC/MCPseudoProbe.cpp +++ b/llvm/lib/MC/MCPseudoProbe.cpp @@ -343,7 +343,7 @@ template ErrorOr MCPseudoProbeDecoder::readUnencodedNumber() { if (Data + sizeof(T) > End) { return std::error_code(); } - T Val = endian::readNext(Data); + T Val = endian::readNext(Data); return ErrorOr(Val); } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 3bb2ce0ae3460..90ba3b541553e 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1169,7 +1169,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, if (EnableSyntheticCounts && !PGOOpt) MPM.addPass(SyntheticCountsPropagation()); - if (EnablePGOForceFunctionAttrs) + if (EnablePGOForceFunctionAttrs && PGOOpt) MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType)); MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true)); diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index a5abf63b010f7..f9ba80bd99c85 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -1135,9 +1135,9 @@ static T swapToHostOrder(const unsigned char *&D, llvm::endianness Orig) { using namespace support; if (Orig == llvm::endianness::little) - return endian::readNext(D); + return endian::readNext(D); else - return endian::readNext(D); + return endian::readNext(D); } static std::unique_ptr allocValueProfData(uint32_t TotalSize) { diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp index a35366a106a32..8574a96a1b06f 100644 --- a/llvm/lib/ProfileData/InstrProfReader.cpp +++ b/llvm/lib/ProfileData/InstrProfReader.cpp @@ -115,10 +115,9 @@ readBinaryIdsInternal(const MemoryBuffer &DataBuffer, uint64_t BILen = 0; if (Endian == llvm::endianness::little) - BILen = - endian::readNext(BI); + BILen = endian::readNext(BI); else - BILen = endian::readNext(BI); + BILen = endian::readNext(BI); if (BILen == 0) return make_error(instrprof_error::malformed, @@ -923,8 +922,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, // Read hash. if (D + sizeof(uint64_t) >= End) return data_type(); - uint64_t Hash = - endian::readNext(D); + uint64_t Hash = endian::readNext(D); // Initialize number of counters for GET_VERSION(FormatVersion) == 1. uint64_t CountsSize = N / sizeof(uint64_t) - 1; @@ -932,8 +930,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, if (GET_VERSION(FormatVersion) != IndexedInstrProf::ProfVersion::Version1) { if (D + sizeof(uint64_t) > End) return data_type(); - CountsSize = - endian::readNext(D); + CountsSize = endian::readNext(D); } // Read counter values. if (D + CountsSize * sizeof(uint64_t) > End) @@ -943,15 +940,14 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, CounterBuffer.reserve(CountsSize); for (uint64_t J = 0; J < CountsSize; ++J) CounterBuffer.push_back( - endian::readNext(D)); + endian::readNext(D)); // Read bitmap bytes for GET_VERSION(FormatVersion) > 10. 
if (GET_VERSION(FormatVersion) > IndexedInstrProf::ProfVersion::Version10) { uint64_t BitmapBytes = 0; if (D + sizeof(uint64_t) > End) return data_type(); - BitmapBytes = - endian::readNext(D); + BitmapBytes = endian::readNext(D); // Read bitmap byte values. if (D + BitmapBytes * sizeof(uint8_t) > End) return data_type(); @@ -959,8 +955,7 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D, BitmapByteBuffer.reserve(BitmapBytes); for (uint64_t J = 0; J < BitmapBytes; ++J) BitmapByteBuffer.push_back(static_cast( - endian::readNext( - D))); + endian::readNext(D))); } DataBuffer.emplace_back(K, Hash, std::move(CounterBuffer), @@ -1256,8 +1251,7 @@ Error IndexedInstrProfReader::readHeader() { // memprof::MemProfVersion0 or the MemProf version number in // memprof::MemProfVersion1. const uint64_t FirstWord = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); memprof::IndexedVersion Version = memprof::Version0; if (FirstWord == memprof::Version1) { @@ -1282,17 +1276,15 @@ Error IndexedInstrProfReader::readHeader() { const uint64_t RecordTableOffset = Version == memprof::Version0 ? FirstWord - : support::endian::readNext(Ptr); + : support::endian::readNext( + Ptr); // The offset in the stream right before invoking // FrameTableGenerator.Emit. const uint64_t FramePayloadOffset = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // The value returned from FrameTableGenerator.Emit. const uint64_t FrameTableOffset = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // Read the schema. auto SchemaOr = memprof::readMemProfSchema(Ptr); @@ -1330,8 +1322,7 @@ Error IndexedInstrProfReader::readHeader() { const unsigned char *Ptr = Start + BinaryIdOffset; // Read binary ids size. BinaryIdsSize = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); if (BinaryIdsSize % sizeof(uint64_t)) return error(instrprof_error::bad_header); // Set the binary ids start. @@ -1348,8 +1339,7 @@ Error IndexedInstrProfReader::readHeader() { const unsigned char *Ptr = Start + VTableNamesOffset; CompressedVTableNamesLen = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // Writer first writes the length of compressed string, and then the actual // content. 
@@ -1369,29 +1359,24 @@ Error IndexedInstrProfReader::readHeader() { if (Ptr + 2 * sizeof(uint64_t) > PtrEnd) return error(instrprof_error::truncated); const uint64_t NumTraces = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); TemporalProfTraceStreamSize = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); for (unsigned i = 0; i < NumTraces; i++) { // Expect at least two 64 bit fields: Weight and NumFunctions if (Ptr + 2 * sizeof(uint64_t) > PtrEnd) return error(instrprof_error::truncated); TemporalProfTraceTy Trace; Trace.Weight = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); const uint64_t NumFunctions = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); // Expect at least NumFunctions 64 bit fields if (Ptr + NumFunctions * sizeof(uint64_t) > PtrEnd) return error(instrprof_error::truncated); for (unsigned j = 0; j < NumFunctions; j++) { const uint64_t NameRef = - support::endian::readNext(Ptr); + support::endian::readNext(Ptr); Trace.FunctionNameRefs.push_back(NameRef); } TemporalProfTraces.push_back(std::move(Trace)); diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp index 97414505f1c13..8e0402dd16e68 100644 --- a/llvm/lib/ProfileData/MemProf.cpp +++ b/llvm/lib/ProfileData/MemProf.cpp @@ -144,14 +144,14 @@ static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema, // Read the meminfo nodes. const uint64_t NumNodes = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t I = 0; I < NumNodes; I++) { IndexedAllocationInfo Node; const uint64_t NumFrames = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t J = 0; J < NumFrames; J++) { const FrameId Id = - endian::readNext(Ptr); + endian::readNext(Ptr); Node.CallStack.push_back(Id); } Node.CSId = hashCallStack(Node.CallStack); @@ -162,15 +162,15 @@ static IndexedMemProfRecord deserializeV0(const MemProfSchema &Schema, // Read the callsite information. const uint64_t NumCtxs = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t J = 0; J < NumCtxs; J++) { const uint64_t NumFrames = - endian::readNext(Ptr); + endian::readNext(Ptr); llvm::SmallVector Frames; Frames.reserve(NumFrames); for (uint64_t K = 0; K < NumFrames; K++) { const FrameId Id = - endian::readNext(Ptr); + endian::readNext(Ptr); Frames.push_back(Id); } Record.CallSites.push_back(Frames); @@ -188,11 +188,10 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema, // Read the meminfo nodes. const uint64_t NumNodes = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t I = 0; I < NumNodes; I++) { IndexedAllocationInfo Node; - Node.CSId = - endian::readNext(Ptr); + Node.CSId = endian::readNext(Ptr); Node.Info.deserialize(Schema, Ptr); Ptr += PortableMemInfoBlock::serializedSize(); Record.AllocSites.push_back(Node); @@ -200,10 +199,10 @@ static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema, // Read the callsite information. 
const uint64_t NumCtxs = - endian::readNext(Ptr); + endian::readNext(Ptr); for (uint64_t J = 0; J < NumCtxs; J++) { CallStackId CSId = - endian::readNext(Ptr); + endian::readNext(Ptr); Record.CallSiteIds.push_back(CSId); } @@ -224,6 +223,24 @@ IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, llvm_unreachable("unsupported MemProf version"); } +MemProfRecord IndexedMemProfRecord::toMemProfRecord( + std::function(const CallStackId)> Callback) + const { + MemProfRecord Record; + + for (const memprof::IndexedAllocationInfo &IndexedAI : AllocSites) { + memprof::AllocationInfo AI; + AI.Info = IndexedAI.Info; + AI.CallStack = Callback(IndexedAI.CSId); + Record.AllocSites.push_back(AI); + } + + for (memprof::CallStackId CSId : CallSiteIds) + Record.CallSites.push_back(Callback(CSId)); + + return Record; +} + GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) { // Canonicalize the function name to drop suffixes such as ".llvm.". Note // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop @@ -245,7 +262,7 @@ Expected readMemProfSchema(const unsigned char *&Buffer) { const unsigned char *Ptr = Buffer; const uint64_t NumSchemaIds = - endian::readNext(Ptr); + endian::readNext(Ptr); if (NumSchemaIds > static_cast(Meta::Size)) { return make_error(instrprof_error::malformed, "memprof schema invalid"); @@ -254,7 +271,7 @@ Expected readMemProfSchema(const unsigned char *&Buffer) { MemProfSchema Result; for (size_t I = 0; I < NumSchemaIds; I++) { const uint64_t Tag = - endian::readNext(Ptr); + endian::readNext(Ptr); if (Tag >= static_cast(Meta::Size)) { return make_error(instrprof_error::malformed, "memprof schema invalid"); diff --git a/llvm/lib/ProfileData/MemProfReader.cpp b/llvm/lib/ProfileData/MemProfReader.cpp index 580867a9083fd..b4d2c6f043f6d 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -86,7 +86,7 @@ llvm::SmallVector readSegmentEntries(const char *Ptr) { using namespace support; const uint64_t NumItemsToRead = - endian::readNext(Ptr); + endian::readNext(Ptr); llvm::SmallVector Items; for (uint64_t I = 0; I < NumItemsToRead; I++) { Items.push_back(*reinterpret_cast( @@ -100,11 +100,11 @@ readMemInfoBlocks(const char *Ptr) { using namespace support; const uint64_t NumItemsToRead = - endian::readNext(Ptr); + endian::readNext(Ptr); llvm::SmallVector> Items; for (uint64_t I = 0; I < NumItemsToRead; I++) { const uint64_t Id = - endian::readNext(Ptr); + endian::readNext(Ptr); const MemInfoBlock MIB = *reinterpret_cast(Ptr); Items.push_back({Id, MIB}); // Only increment by size of MIB since readNext implicitly increments. 
@@ -117,20 +117,20 @@ CallStackMap readStackInfo(const char *Ptr) { using namespace support; const uint64_t NumItemsToRead = - endian::readNext(Ptr); + endian::readNext(Ptr); CallStackMap Items; for (uint64_t I = 0; I < NumItemsToRead; I++) { const uint64_t StackId = - endian::readNext(Ptr); + endian::readNext(Ptr); const uint64_t NumPCs = - endian::readNext(Ptr); + endian::readNext(Ptr); SmallVector CallStack; CallStack.reserve(NumPCs); for (uint64_t J = 0; J < NumPCs; J++) { CallStack.push_back( - endian::readNext(Ptr)); + endian::readNext(Ptr)); } Items[StackId] = CallStack; @@ -183,6 +183,28 @@ std::string getBuildIdString(const SegmentEntry &Entry) { } } // namespace +MemProfReader::MemProfReader( + llvm::DenseMap FrameIdMap, + llvm::MapVector ProfData) + : IdToFrame(std::move(FrameIdMap)), + FunctionProfileData(std::move(ProfData)) { + // Populate CSId in each IndexedAllocationInfo and IndexedMemProfRecord + // while storing CallStack in CSIdToCallStack. + for (auto &KV : FunctionProfileData) { + IndexedMemProfRecord &Record = KV.second; + for (auto &AS : Record.AllocSites) { + CallStackId CSId = hashCallStack(AS.CallStack); + AS.CSId = CSId; + CSIdToCallStack.insert({CSId, AS.CallStack}); + } + for (auto &CS : Record.CallSites) { + CallStackId CSId = hashCallStack(CS); + Record.CallSiteIds.push_back(CSId); + CSIdToCallStack.insert({CSId, CS}); + } + } +} + Expected> RawMemProfReader::create(const Twine &Path, const StringRef ProfiledBinary, bool KeepName) { @@ -445,6 +467,7 @@ Error RawMemProfReader::mapRawProfileToRecords() { } CallStackId CSId = hashCallStack(Callstack); + CSIdToCallStack.insert({CSId, Callstack}); // We attach the memprof record to each function bottom-up including the // first non-inline frame. @@ -467,7 +490,10 @@ Error RawMemProfReader::mapRawProfileToRecords() { auto Result = FunctionProfileData.insert({Id, IndexedMemProfRecord()}); IndexedMemProfRecord &Record = Result.first->second; for (LocationPtr Loc : Locs) { + CallStackId CSId = hashCallStack(*Loc); + CSIdToCallStack.insert({CSId, *Loc}); Record.CallSites.push_back(*Loc); + Record.CallSiteIds.push_back(CSId); } } diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 98d0aa794529c..f91a0e6177ea0 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -503,7 +503,7 @@ ErrorOr SampleProfileReaderBinary::readUnencodedNumber() { } using namespace support; - T Val = endian::readNext(Data); + T Val = endian::readNext(Data); return Val; } diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index cbdc64bc7a97b..fa967403ea449 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -119,6 +119,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"za128rs", {1, 0}}, {"za64rs", {1, 0}}, {"zacas", {1, 0}}, + {"zama16b", {1, 0}}, {"zawrs", {1, 0}}, {"zba", {1, 0}}, diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 5cc612e89162a..419c141121c32 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -4325,10 +4325,7 @@ AArch64FrameLowering::inlineStackProbeLoopExactMultiple( ExitMBB->transferSuccessorsAndUpdatePHIs(&MBB); MBB.addSuccessor(LoopMBB); // Update liveins. 
- bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, LoopMBB}); return ExitMBB->begin(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index eee67a0f823c1..7947d73f9a4dd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -15888,7 +15888,7 @@ unsigned AArch64TargetLowering::getNumInterleavedAccesses( unsigned VecSize = 128; unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); unsigned MinElts = VecTy->getElementCount().getKnownMinValue(); - if (UseScalable) + if (UseScalable && isa(VecTy)) VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u); return std::max(1, (MinElts * ElSize + 127) / VecSize); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 92647cb405252..9518d573bccdd 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -9556,15 +9556,8 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, MBB.addSuccessor(LoopTestMBB); // Update liveins. - if (MF.getRegInfo().reservedRegsFrozen()) { - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || - recomputeLiveIns(*LoopBodyMBB) || - recomputeLiveIns(*LoopTestMBB); - } while (anyChange); - ; - } + if (MF.getRegInfo().reservedRegsFrozen()) + fullyRecomputeLiveIns({ExitMBB, LoopBodyMBB, LoopTestMBB}); return ExitMBB->begin(); } diff --git a/llvm/lib/Target/AArch64/AArch64SchedA510.td b/llvm/lib/Target/AArch64/AArch64SchedA510.td index 68343674bc819..9456878946151 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedA510.td +++ b/llvm/lib/Target/AArch64/AArch64SchedA510.td @@ -254,7 +254,7 @@ def : InstRW<[WriteIS], (instrs RBITWr, RBITXr)>; // Compute pointer authentication code for data address // Compute pointer authentication code, using generic key // Compute pointer authentication code for instruction address -def : InstRW<[CortexA510Write<3, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; +def : InstRW<[CortexA510Write<5, CortexA510UnitPAC>], (instregex "^AUT", "^PAC")>; // Branch and link, register, with pointer authentication // Branch, register, with pointer authentication @@ -401,30 +401,30 @@ def : InstRW<[CortexA510WriteFPALU_F3], (instrs FCSELHrrr, FCSELSrrr, FCSELDrrr) def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(2i32|4i16|8i8)")>; def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDv(16i8|4i32|8i16)")>; // ASIMD absolute diff accum -def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; +def : InstRW<[CortexA510Write<6, CortexA510UnitVALU>], (instregex "[SU]ABAL?v")>; // ASIMD absolute diff long def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]ABDLv")>; // ASIMD arith #1 -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(1i64|2i32|4i16|8i8)", - "[SU]R?HADDv(2i32|4i16|8i8)", "[SU]HSUBv(2i32|4i16|8i8)")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v(2i64|4i32|8i16|16i8)", - "[SU]R?HADDv(8i16|4i32|16i8)", "[SU]HSUBv(8i16|4i32|16i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(ADD|SUB|NEG)v", + "[SU]R?HADDv", "[SU]HSUBv")>; // ASIMD arith #2 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex 
"ABSv(1i64|2i32|4i16|8i8)$", +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(1i64|2i32|4i16|8i8)$", "[SU]ADDLPv(2i32_v1i64|4i16_v2i32|8i8_v4i16)$", - "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$", "ADDPv(2i32|4i16|8i8)$")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(1i16|1i32|1i64|1i8|2i32|4i16|8i8)$")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ABSv(2i64|4i32|8i16|16i8)$", "[SU]ADDLPv(16i8_v8i16|4i32_v2i64|8i16_v4i32)$", - "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$", "ADDPv(16i8|2i64|4i32|8i16)$")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "([SU]QADD|[SU]QSUB|SQNEG|SUQADD|USQADD)v(16i8|2i64|4i32|8i16)$")>; // ASIMD arith #3 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", - "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv", "ADDHNv", "SUBHNv")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SADDLv", "UADDLv", "SADDWv", + "UADDWv", "SSUBLv", "USUBLv", "SSUBWv", "USUBWv")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDHNv", "SUBHNv")>; // ASIMD arith #5 -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; +def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "RADDHNv", "RSUBHNv")>; // ASIMD arith, reduce -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "ADDVv", "SADDLVv", "UADDLVv")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "ADDVv")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "SADDLVv", "UADDLVv")>; // ASIMD compare #1 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(1i64|2i32|4i16|8i8)")>; def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "CM(EQ|GE|GT|HI|HS|LE|LT)v(2i64|4i32|8i16|16i8)")>; @@ -437,8 +437,8 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT| def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "(AND|EOR|NOT|ORN)v16i8", "(ORR|BIC)v(16i8|4i32|8i16)$", "MVNIv(4i32|4s|8i16)")>; // ASIMD max/min, basic -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(2i32|4i16|8i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU](MIN|MAX)P?v(16i8|4i132|8i16)")>; // SIMD max/min, reduce def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU](MAX|MIN)Vv")>; // ASIMD multiply, by element @@ -467,12 +467,12 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]MULLv", " // ASIMD polynomial (8x8) multiply long def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs PMULLv8i8, PMULLv16i8)>; // ASIMD pairwise add and accumulate -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>; +def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]ADALPv")>; // ASIMD shift accumulate -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], 
(instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRA(d|v2i32|v4i16|v8i8)")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "[SU]SRAv(16i8|2i64|4i32|8i16)")>; // ASIMD shift accumulate #2 -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; +def : InstRW<[CortexA510MCWrite<7, 2, CortexA510UnitVALU>], (instregex "[SU]RSRA[vd]")>; // ASIMD shift by immed def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "SHLd$", "SHLv", "SLId$", "SRId$", "[SU]SHR[vd]", "SHRNv")>; @@ -504,7 +504,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "[SU]QRSHLv(2i def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]rr$", "^AESI?MCrr")>; // Crypto polynomial (64x64) multiply long -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instrs PMULLv1i64, PMULLv2i64)>; // Crypto SHA1 hash acceleration op // Crypto SHA1 schedule acceleration ops @@ -512,25 +512,26 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^SHA1(H|SU0|S // Crypto SHA1 hash acceleration ops // Crypto SHA256 hash acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA1[CMP]", "^SHA256H2?")>; // Crypto SHA256 schedule acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; +def : InstRW<[CortexA510MCWrite<4, 0, CortexA510UnitVMC>], (instregex "^SHA256SU[01]")>; // Crypto SHA512 hash acceleration ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SHA512(H|H2|SU0|SU1)")>; // Crypto SHA3 ops -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3, XAR)>; -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs RAX1)>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instrs BCAX, EOR3)>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs XAR)>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs RAX1)>; // Crypto SM3 ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instregex "^SM3PARTW[12]$", "^SM3SS1$", "^SM3TT[12][AB]$")>; // Crypto SM4 ops -def : InstRW<[CortexA510MCWrite<8, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; +def : InstRW<[CortexA510MCWrite<9, 0, CortexA510UnitVMC>], (instrs SM4E, SM4ENCKEY)>; // CRC // ----------------------------------------------------------------------------- @@ -540,25 +541,25 @@ def : InstRW<[CortexA510MCWrite<2, 0, CortexA510UnitMAC>], (instregex "^CRC32")> // SVE Predicate instructions // Loop control, based on predicate -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKA_PPmP, BRKA_PPzP, BRKB_PPmP, BRKB_PPzP)>; // Loop control, based on predicate and flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKAS_PPzP, BRKBS_PPzP)>; // Loop control, propagating -def : InstRW<[CortexA510Write<6, 
CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKN_PPzP, BRKPA_PPzPP, BRKPB_PPzPP)>; // Loop control, propagating and flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs BRKNS_PPzP)>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instrs BRKPAS_PPzPP, BRKPBS_PPzPP)>; // Loop control, based on GPR -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(GE|GT|HI|HS|LE|LO|LS|LT)_P(WW|XX)_[BHSD]")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^WHILE(RW|WR)_PXX_[BHSD]")>; // Loop terminate def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CTERM(EQ|NE)_(WW|XX)")>; @@ -569,20 +570,20 @@ def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instrs ADDPL_XXI, ADDVL_X def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], (instregex "^CNT[BHWD]_XPiI")>; -def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], +def : InstRW<[CortexA510Write<3, CortexA510UnitALU>], (instregex "^(INC|DEC)[BHWD]_XPiI")>; -def : InstRW<[CortexA510Write<1, CortexA510UnitALU>], +def : InstRW<[CortexA510Write<4, CortexA510UnitALU>], (instregex "^(SQINC|SQDEC|UQINC|UQDEC)[BHWD]_[XW]Pi(Wd)?I")>; // Predicate counting scalar, active predicate -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^CNTP_XPP_[BHSD]")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(DEC|INC)P_XP_[BHSD]")>; -def : InstRW<[CortexA510Write<8, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<9, CortexA510UnitVALU0>], (instregex "^(SQDEC|SQINC|UQDEC|UQINC)P_XP_[BHSD]", "^(UQDEC|UQINC)P_WP_[BHSD]", "^(SQDEC|SQINC|UQDEC|UQINC)P_XPWd_[BHSD]")>; @@ -593,39 +594,39 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)P_ZP_[HSD]")>; // Predicate logical -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(AND|BIC|EOR|NAND|NOR|ORN|ORR)_PPzPP")>; // Predicate logical, flag setting -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ANDS|BICS|EORS|NANDS|NORS|ORNS|ORRS)_PPzPP")>; // Predicate reverse -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^REV_PP_[BHSD]")>; // Predicate select -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs SEL_PPPP)>; // Predicate set -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFALSE", "^PTRUE_[BHSD]")>; // Predicate set/initialize, set flags -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PTRUES_[BHSD]")>; // Predicate find first/next -def : 
InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>; // Predicate test -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PTEST_PP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PTEST_PP)>; // Predicate transpose -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^TRN[12]_PPP_[BHSDQ]")>; // Predicate unpack and widen -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instrs PUNPKHI_PP, PUNPKLO_PP)>; // Predicate zip/unzip -def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; +def : InstRW<[CortexA510Write<2, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[12]_PPP_[BHSDQ]")>; // SVE integer instructions @@ -634,10 +635,10 @@ def : InstRW<[CortexA510Write<6, CortexA510UnitVALU0>], (instregex "^(ZIP|UZP)[1 def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABD_(ZPmZ|ZPZZ)_[BHSD]")>; // Arithmetic, absolute diff accum -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; +def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABA_ZZZ_[BHSD]")>; // Arithmetic, absolute diff accum long -def : InstRW<[CortexA510MCWrite<8, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; +def : InstRW<[CortexA510MCWrite<6, 2, CortexA510UnitVALU>], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]")>; // Arithmetic, absolute diff long def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]")>; @@ -651,20 +652,22 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], "^(ADD|SUB|SUBR)_ZI_[BHSD]", "^ADR_[SU]XTW_ZZZ_D_[0123]", "^ADR_LSL_ZZZ_[SD]_[0123]", - "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", + "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], + (instregex "^[SU](ADD|SUB)[LW][BT]_ZZZ_[HSD]", "^SADDLBT_ZZZ_[HSD]", - "^[SU]H(ADD|SUB|SUBR)_ZPmZ_[BHSD]", "^SSUBL(BT|TB)_ZZZ_[HSD]")>; // Arithmetic, complex def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], - (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]", - "^SQ(ABS|NEG)_ZPmZ_[BHSD]", + (instregex "^SQ(ABS|NEG)_ZPmZ_[BHSD]", "^SQ(ADD|SUB|SUBR)_ZPmZ_?[BHSD]", "^[SU]Q(ADD|SUB)_ZZZ_[BHSD]", "^[SU]Q(ADD|SUB)_ZI_[BHSD]", "^(SRH|SUQ|UQ|USQ|URH)ADD_ZPmZ_[BHSD]", "^(UQSUB|UQSUBR)_ZPmZ_[BHSD]")>; +def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], + (instregex "^R?(ADD|SUB)HN[BT]_ZZZ_[BHS]")>; // Arithmetic, large integer def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]")>; @@ -735,14 +738,14 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BSL|BSL1N|B // Count/reverse bits def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(CLS|CLZ|RBIT)_ZPmZ_[BHSD]")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_[BH]")>; def : InstRW<[CortexA510Write<8, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_S")>; def : InstRW<[CortexA510Write<12, CortexA510UnitVALU>], (instregex "^CNT_ZPmZ_D")>; // Broadcast logical bitmask immediate to vector def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instrs DUPM_ZI)>; // 
Compare and set flags -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_PPzZ[IZ]_[BHSD]", "^CMP(EQ|GE|GT|HI|HS|LE|LO|LS|LT|NE)_WIDE_PPzZZ_[BHS]")>; @@ -939,12 +942,14 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^SQRDMULH_ZZZ // Multiply/multiply long, (8x8) polynomial def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^PMUL_ZZZ_B")>; -def : InstRW<[CortexA510Write<6, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; +def : InstRW<[CortexA510Write<9, CortexA510UnitVMC>], (instregex "^PMULL[BT]_ZZZ_[HDQ]")>; // Predicate counting vector +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], + (instregex "^(DEC|INC)[HWD]_ZPiI")>; def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], - (instregex "^(DEC|INC|SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; + (instregex "^(SQDEC|SQINC|UQDEC|UQINC)[HWD]_ZPiI")>; // Reciprocal estimate def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^URECPE_ZPmZ_S", "^URSQRTE_ZPmZ_S")>; @@ -965,7 +970,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^[SU](ADD|MA def : InstRW<[CortexA510Write<4, CortexA510UnitVALU0>], (instregex "^(ANDV|EORV|ORV)_VPZ_[BHSD]")>; // Reverse, vector -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^REV_ZZ_[BHSD]", "^REVB_ZPmZ_[HSD]", "^REVH_ZPmZ_[SD]", "^REVW_ZPmZ_D")>; @@ -980,13 +985,13 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBL_ZZZZ?_[B def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TBX_ZZZ_[BHSD]")>; // Transpose, vector form -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^TRN[12]_ZZZ_[BHSDQ]")>; // Unpack and extend def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^[SU]UNPK(HI|LO)_ZZ_[HSD]")>; // Zip/unzip -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(UZP|ZIP)[12]_ZZZ_[BHSDQ]")>; // SVE floating-point instructions // ----------------------------------------------------------------------------- @@ -1142,7 +1147,7 @@ def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTMAD_ZZI_[H // Floating point trigonometric, miscellaneous def : InstRW<[CortexA510Write<4, CortexA510UnitVMAC>], (instregex "^FTSMUL_ZZZ_[HSD]")>; -def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; +def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^FTSSEL_ZZZ_[HSD]")>; // SVE BFloat16 (BF16) instructions @@ -1251,12 +1256,12 @@ def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLdSt>], "^GLD(FF)?1D(_SCALED)?$")>; // Gather load, 32-bit scaled offset -def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], +def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[HW]_S_[SU]XTW_SCALED$", "^GLD(FF)?1W_[SU]XTW_SCALED")>; // Gather load, 32-bit unpacked unscaled offset -def : InstRW<[CortexA510MCWrite<9, 9, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", +def : InstRW<[CortexA510MCWrite<7, 7, CortexA510UnitLd>], (instregex "^GLD(FF)?1S?[BH]_S_[SU]XTW$", "^GLD(FF)?1W_[SU]XTW$")>; def : InstRW<[CortexA510Write<0, CortexA510UnitVALU>], (instregex 
"^PRF(B|H|W|D).*")>; @@ -1377,12 +1382,12 @@ def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^AES[DE]_ZZZ_ "^AESI?MC_ZZ_B$")>; // Crypto SHA3 ops -def : InstRW<[CortexA510Write<3, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", +def : InstRW<[CortexA510Write<4, CortexA510UnitVALU>], (instregex "^(BCAX|EOR3)_ZZZZ$", "^XAR_ZZZI_[BHSD]$")>; -def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; +def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^RAX1_ZZZ_D$")>; // Crypto SM4 ops -def : InstRW<[CortexA510MC_RC0Write<8, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; +def : InstRW<[CortexA510MC_RC0Write<9, CortexA510UnitVMC>], (instregex "^SM4E(KEY)?_ZZZ_S$")>; } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 661ea151d1a0c..85dd0f2eb192d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -262,7 +262,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .minScalar(0, s32) .scalarize(0); - getActionDefinitionsBuilder(G_INTRINSIC_LRINT) + getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT}) .legalFor({{s64, MinFPScalar}, {s64, s32}, {s64, s64}}) .libcallFor({{s64, s128}}) .minScalarOrElt(1, MinFPScalar); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index d39de770eaf16..44ba9f0429e67 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -424,43 +424,6 @@ void AArch64RegisterBankInfo::applyMappingImpl( } } -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// having only floating-point operands. 
-static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - case TargetOpcode::G_INTRINSIC_ROUNDEVEN: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXIMUM: - case TargetOpcode::G_FMINIMUM: - return true; - } - return false; -} - const RegisterBankInfo::InstructionMapping & AArch64RegisterBankInfo::getSameKindOfOperandsMapping( const MachineInstr &MI) const { @@ -830,6 +793,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FPTOSI: case TargetOpcode::G_FPTOUI: case TargetOpcode::G_INTRINSIC_LRINT: + case TargetOpcode::G_INTRINSIC_LLRINT: if (MRI.getType(MI.getOperand(0).getReg()).isVector()) break; OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR}; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index f283af6fa07d3..db69d50799e70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -59,6 +59,12 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) { AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { + // Always lower memset, memcpy, and memmove intrinsics to load/store + // instructions, rather than generating calls to memset, memcpy, or memmove. + MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U; + MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U; + MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U; + // Lower floating point store/load to integer store/load to reduce the number // of patterns in tablegen.
setOperationAction(ISD::LOAD, MVT::f32, Promote); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f7e552177d6f5..305a6c8c3b926 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -655,9 +655,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks( PB.registerPipelineStartEPCallback( [](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; - FPM.addPass(AMDGPUUseNativeCallsPass()); - if (EnableLibCallSimplify && Level != OptimizationLevel::O0) - FPM.addPass(AMDGPUSimplifyLibCallsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); if (EnableHipStdPar) PM.addPass(HipStdParAcceleratorCodeSelectionPass()); @@ -681,6 +678,16 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks( PM.addPass(AMDGPUAlwaysInlinePass()); }); + PB.registerPeepholeEPCallback( + [](FunctionPassManager &FPM, OptimizationLevel Level) { + if (Level == OptimizationLevel::O0) + return; + + FPM.addPass(AMDGPUUseNativeCallsPass()); + if (EnableLibCallSimplify) + FPM.addPass(AMDGPUSimplifyLibCallsPass()); + }); + PB.registerCGSCCOptimizerLateEPCallback( [this](CGSCCPassManager &PM, OptimizationLevel Level) { if (Level == OptimizationLevel::O0) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 273f92abf3546..8053d89aeb0a8 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1726,7 +1726,7 @@ let SubtargetPredicate = isGFX12Plus in { defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>; } -let SubtargetPredicate = isGFX6GFX7GFX10Plus in { +let OtherPredicates = [isGFX6GFX7GFX10Plus] in { defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">; defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">; } diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 05e10a95b157c..1dda1b89b2d36 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -101,6 +101,7 @@ class GCNCreateVOPD : public MachineFunctionPass { } } + SII->fixImplicitOperands(*VOPDInst); for (auto CompIdx : VOPD::COMPONENTS) VOPDInst.copyImplicitOps(*MI[CompIdx]); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 245731ad5fc7c..acb54fd10b90d 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -612,13 +612,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Reserve null register - it shall never be allocated reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64); - // Disallow vcc_hi allocation in wave32. It may be allocated but most likely - // will result in bugs. - if (isWave32) { - Reserved.set(AMDGPU::VCC); - Reserved.set(AMDGPU::VCC_HI); - } - // Reserve SGPRs. 
// unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF); diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 8629551152cb6..ea5dd5427ce72 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -1806,13 +1806,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { PostOrderLoopTraversal DFS(LoLoop.ML, *MLI); DFS.ProcessLoop(); const SmallVectorImpl &PostOrder = DFS.getOrder(); - bool anyChange = false; - do { - anyChange = false; - for (auto *MBB : PostOrder) { - anyChange = recomputeLiveIns(*MBB) || anyChange; - } - } while (anyChange); + fullyRecomputeLiveIns(PostOrder); for (auto *MBB : reverse(PostOrder)) recomputeLivenessFlags(*MBB); diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index de492f2b1f0a4..98f5014a34b1d 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -226,11 +226,8 @@ bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign( MCFixup::create(0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_align)); const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec]; if (MCSym == nullptr) { - // Create a symbol and make the value of symbol is zero. - MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); - Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); - Asm.registerSymbol(*Sym); - MCSym = MCSymbolRefExpr::create(Sym, Ctx); + // Use section symbol directly. + MCSym = MCSymbolRefExpr::create(Sec->getBeginSymbol(), Ctx); getSecToAlignSym()[Sec] = MCSym; } diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp index 6af1fd8c88e57..62b58cba9f24a 100644 --- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp +++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp @@ -104,26 +104,6 @@ MipsRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, } } -// Instructions where all register operands are floating point. -static bool isFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - return true; - default: - return false; - } -} - // Instructions where use operands are floating point registers. // Def operands are general purpose. 
static bool isFloatingPointOpcodeUse(unsigned Opc) { @@ -133,7 +113,7 @@ static bool isFloatingPointOpcodeUse(unsigned Opc) { case TargetOpcode::G_FCMP: return true; default: - return isFloatingPointOpcode(Opc); + return isPreISelGenericFloatingPointOpcode(Opc); } } @@ -145,7 +125,7 @@ static bool isFloatingPointOpcodeDef(unsigned Opc) { case TargetOpcode::G_UITOFP: return true; default: - return isFloatingPointOpcode(Opc); + return isPreISelGenericFloatingPointOpcode(Opc); } } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 4daf47cfd4df8..cd8546005c028 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3785,8 +3785,7 @@ class Pseudo pattern> def Callseq_Start : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "\\{ // callseq $amt1, $amt2\n" - "\t.reg .b32 temp_param_reg;", + "\\{ // callseq $amt1, $amt2", [(callseq_start timm:$amt1, timm:$amt2)]>; def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), diff --git a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp index 6aeef145e3078..125a49de7b27d 100644 --- a/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp +++ b/llvm/lib/Target/PowerPC/GISel/PPCRegisterBankInfo.cpp @@ -13,6 +13,7 @@ #include "PPCRegisterBankInfo.h" #include "PPCRegisterInfo.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/Debug.h" @@ -239,44 +240,6 @@ PPCRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getInstructionMapping(MappingID, Cost, OperandsMapping, NumOperands); } -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// having only floating-point operands. -/// FIXME: this is copied from target AArch64. Needs some code refactor here to -/// put this function in GlobalISel/Utils.cpp. -static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXIMUM: - case TargetOpcode::G_FMINIMUM: - return true; - } - return false; -} - /// \returns true if a given intrinsic \p ID only uses and defines FPRs. static bool isFPIntrinsic(unsigned ID) { // TODO: Add more intrinsics. 
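The hunks above in AArch64RegisterBankInfo.cpp, MipsRegisterBankInfo.cpp, and PPCRegisterBankInfo.cpp all delete per-target copies of the floating-point-opcode predicate in favor of the single llvm::isPreISelGenericFloatingPointOpcode now defined in llvm/lib/CodeGen/GlobalISel/Utils.cpp (declared through llvm/CodeGen/GlobalISel/Utils.h, which PPCRegisterBankInfo.cpp newly includes). A minimal sketch of the resulting call pattern, assuming only what this patch shows; the wrapper name wantsFPRBank is illustrative and not code from the patch:

    #include "llvm/CodeGen/GlobalISel/Utils.h"
    #include "llvm/CodeGen/MachineInstr.h"

    // Generic FP opcodes use only floating-point operands, so a register-bank
    // mapping can send any instruction matching the shared predicate straight
    // to the FPR bank without inspecting individual operands.
    // wantsFPRBank is a hypothetical helper for illustration only.
    static bool wantsFPRBank(const llvm::MachineInstr &MI) {
      return llvm::isPreISelGenericFloatingPointOpcode(MI.getOpcode());
    }

Targets that treat additional opcodes as FP-like keep a small local switch and fall through to the shared helper in the default case, as the Mips isFloatingPointOpcodeUse and isFloatingPointOpcodeDef hunks above do.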
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td index 5015ba887d0b6..32cebb65cb569 100644 --- a/llvm/lib/Target/PowerPC/P10InstrResources.td +++ b/llvm/lib/Target/PowerPC/P10InstrResources.td @@ -881,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY], // 3 Cycles ALU operations, 1 input operands def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read], (instrs - ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL8, LI, LI8, + ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, ADDItocL8, LI, LI8, ADDIC, ADDIC8, ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8, ADDME, ADDME8, diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 1c57b92057fff..6e1002c45d81c 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1148,15 +1148,27 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { MCSymbolRefExpr::VariantKind VK = GetVKForMO(MO); - // Always use TOC on AIX. Map the global address operand to be a reference - // to the TOC entry we will synthesize later. 'TOCEntry' is a label used to - // reference the storage allocated in the TOC which contains the address of - // 'MOSymbol'. - MCSymbol *TOCEntry = - lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK); - const MCExpr *Exp = MCSymbolRefExpr::create(TOCEntry, - MCSymbolRefExpr::VK_PPC_U, - OutContext); + // If the symbol isn't toc-data then use the TOC on AIX. + // Map the global address operand to be a reference to the TOC entry we + // will synthesize later. 'TOCEntry' is a label used to reference the + // storage allocated in the TOC which contains the address of 'MOSymbol'. + // If the toc-data attribute is used, the TOC entry contains the data + // rather than the address of the MOSymbol. + if (![](const MachineOperand &MO) { + if (!MO.isGlobal()) + return false; + + const GlobalVariable *GV = dyn_cast(MO.getGlobal()); + if (!GV) + return false; + + return GV->hasAttribute("toc-data"); + }(MO)) { + MOSymbol = lookUpOrCreateTOCEntry(MOSymbol, getTOCEntryTypeForMO(MO), VK); + } + + const MCExpr *Exp = MCSymbolRefExpr::create( + MOSymbol, MCSymbolRefExpr::VK_PPC_U, OutContext); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; @@ -1273,25 +1285,32 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { EmitToStreamer(*OutStreamer, TmpInst); return; } + case PPC::ADDItocL: case PPC::ADDItocL8: { - // Transform %xd = ADDItocL8 %xs, @sym + // Transform %xd = ADDItocL %xs, @sym LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); - // Change the opcode to ADDI8. If the global address is external, then - // generate a TOC entry and reference that. Otherwise, reference the - // symbol directly. - TmpInst.setOpcode(PPC::ADDI8); + unsigned Op = MI->getOpcode(); + + // Change the opcode to load address for tocdata + TmpInst.setOpcode(Op == PPC::ADDItocL8 ? PPC::ADDI8 : PPC::LA); const MachineOperand &MO = MI->getOperand(2); - assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL8."); + assert((Op == PPC::ADDItocL8) + ? 
(MO.isGlobal() || MO.isCPI()) + : MO.isGlobal() && "Invalid operand for ADDItocL8."); + assert(!(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) && + "Interposable definitions must use indirect accesses."); - LLVM_DEBUG(assert( - !(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) && - "Interposable definitions must use indirect access.")); + // Map the operand to its corresponding MCSymbol. + const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this); + + const MCExpr *Exp = MCSymbolRefExpr::create( + MOSymbol, + Op == PPC::ADDItocL8 ? MCSymbolRefExpr::VK_PPC_TOC_LO + : MCSymbolRefExpr::VK_PPC_L, + OutContext); - const MCExpr *Exp = - MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this), - MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; diff --git a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp index b43eee8fdd8c0..b3cfcb2aa1440 100644 --- a/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp +++ b/llvm/lib/Target/PowerPC/PPCExpandAtomicPseudoInsts.cpp @@ -208,10 +208,7 @@ bool PPCExpandAtomicPseudo::expandAtomicRMW128( .addMBB(LoopMBB); CurrentMBB->addSuccessor(LoopMBB); CurrentMBB->addSuccessor(ExitMBB); - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, LoopMBB}); NMBBI = MBB.end(); MI.eraseFromParent(); return true; @@ -288,11 +285,7 @@ bool PPCExpandAtomicPseudo::expandAtomicCmpSwap128( CurrentMBB->addSuccessor(LoopCmpMBB); CurrentMBB->addSuccessor(ExitMBB); - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*CmpSuccMBB) || - recomputeLiveIns(*LoopCmpMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, CmpSuccMBB, LoopCmpMBB}); NMBBI = MBB.end(); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index 6dcb59a3a57f8..04e9f9e2366ed 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -1435,11 +1435,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, ProbeLoopBodyMBB->addSuccessor(ProbeLoopBodyMBB); } // Update liveins. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ProbeExitMBB) || - recomputeLiveIns(*ProbeLoopBodyMBB); - } while (anyChange); + fullyRecomputeLiveIns({ProbeExitMBB, ProbeLoopBodyMBB}); return ProbeExitMBB; }; // For case HasBP && MaxAlign > 1, we have to realign the SP by performing @@ -1531,10 +1527,7 @@ void PPCFrameLowering::inlineStackProbe(MachineFunction &MF, buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg); } // Update liveins. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*ExitMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({ExitMBB, LoopMBB}); } } ++NumPrologProbed; diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index af82b6cdb1809..2f647daa4bcb5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -510,7 +510,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { } // Check if a SDValue has the toc-data attribute. 
-static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) { +static bool hasTocDataAttr(SDValue Val) { GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Val); if (!GA) return false; @@ -6115,8 +6115,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { assert(isAIXABI && "ELF ABI already handled"); - if (hasTocDataAttr(N->getOperand(0), - CurDAG->getDataLayout().getPointerSize())) { + if (hasTocDataAttr(N->getOperand(0))) { replaceWith(PPC::ADDItoc, N, MVT::i32); return; } @@ -6128,8 +6127,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (isPPC64 && CModel == CodeModel::Small) { assert(isAIXABI && "ELF ABI handled in common SelectCode"); - if (hasTocDataAttr(N->getOperand(0), - CurDAG->getDataLayout().getPointerSize())) { + if (hasTocDataAttr(N->getOperand(0))) { replaceWith(PPC::ADDItoc8, N, MVT::i64); return; } @@ -6144,9 +6142,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) { " ELF/AIX or 32-bit AIX in the following."); // Transforms the ISD::TOC_ENTRY node for 32-bit AIX large code model mode - // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We - // generate two instructions as described below. The first source operand - // is a symbol reference. If it must be toc-referenced according to + // or 64-bit medium (ELF-only) or large (ELF and AIX) code model code for + // non-toc-data symbols. + // We generate two instructions as described below. The first source + // operand is a symbol reference. If it must be toc-referenced according to // Subtarget, we generate: // [32-bit AIX] // LWZtocL(@sym, ADDIStocHA(%r2, @sym)) // [64-bit ELF/AIX] // LDtocL(@sym, ADDIStocHA8(%x2, @sym)) // Otherwise we generate: // ADDItocL8(ADDIStocHA8(%x2, @sym), @sym) + + // For large code model toc-data symbols we generate: + // [32-bit AIX] + // ADDItocL(ADDIStocHA(%r2, @sym), @sym) + // [64-bit AIX] + // Currently not supported. + SDValue GA = N->getOperand(0); SDValue TOCbase = N->getOperand(1); @@ -6161,6 +6167,19 @@ SDNode *Tmp = CurDAG->getMachineNode( isPPC64 ? PPC::ADDIStocHA8 : PPC::ADDIStocHA, dl, VT, TOCbase, GA); + // On AIX, if the symbol has the toc-data attribute it will be defined + // in the TOC entry, so we use an ADDItocL similar to the medium code + // model ELF ABI. + if (isAIXABI && hasTocDataAttr(GA)) { + if (isPPC64) + report_fatal_error( + "64-bit large code model toc-data not yet supported"); + + ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, VT, + SDValue(Tmp, 0), GA)); + return; + } + if (PPCLowering->isAccessedAsGotIndirect(GA)) { // If it is accessed as got-indirect, we need an extra LWZ/LD to load // the address.
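The ADDIStocHA/ADDItocL pair materializes the 32-bit TOC offset as a high and a low half, where the upper immediate is rounded up whenever the low 16 bits will be sign-extended as negative, so the trailing la/addi lands on the exact address. A self-contained arithmetic check of that split (illustrative only; it assumes the @u/@l pair behaves like the familiar high-adjusted/low split, and is not LLVM code):

#include <cassert>
#include <cstdint>

// High-adjusted upper half: the 0x8000 bias compensates for the
// sign-extended low part that the following la/addi adds back.
static int32_t ha16(int32_t x) { return (x + 0x8000) >> 16; }
// Low 16 bits as the sign-extended displacement la/addi provides.
static int32_t lo16(int32_t x) { return static_cast<int16_t>(x & 0xFFFF); }

int main() {
  for (int32_t x : {0, 1, 0x7FFF, 0x8000, 0x12345678, -42})
    assert((ha16(x) << 16) + lo16(x) == x); // addis-then-la recovers x exactly.
}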
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 93874d65531ae..b32f178ca38e6 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1090,6 +1090,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable( case PPC::LIS8: case PPC::ADDIStocHA: case PPC::ADDIStocHA8: + case PPC::ADDItocL: case PPC::ADDItocL8: case PPC::LOAD_STACK_GUARD: case PPC::PPCLdFixedAddr: diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 6423e692d88c3..43e3902cf4074 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3346,11 +3346,13 @@ def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentr "#ADDIStocHA", [(set i32:$rD, (PPCtoc_entry i32:$reg, tglobaladdr:$disp))]>; -// Local Data Transform +// TOC Data Transform AIX def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), "#ADDItoc", [(set i32:$rD, (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), + "#ADDItocL", []>; // Get Global (GOT) Base Register offset, from the word immediately preceding // the function label. diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def index fb6e656edb8b9..1a61ae23e5d72 100644 --- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def +++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def @@ -32,7 +32,7 @@ // {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx, // lvewx, lvx, lxsdx} FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \ - FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), \ + FUSION_OP_SET(ADDI, ADDI8, ADDItocL, ADDItocL8), \ FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \ LVX, LXSDX)) @@ -134,13 +134,13 @@ FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8), // addis rx,ra,si - addi rt,rx,SI, SI >= 0 FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1, - FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), - FUSION_OP_SET(ADDI, ADDI8, ADDItocL8)) + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8, ADDIStocHA), + FUSION_OP_SET(ADDI, ADDI8, ADDItocL8, ADDItocL)) // addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2 FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1, - FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), - FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8)) + FUSION_OP_SET(ADDI, ADDI8, ADDItocL8, ADDItocL), + FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8, ADDIStocHA)) // mtctr - { bcctr,bcctrl } FUSION_FEATURE(ZeroMoveCTR, hasZeroMoveFusion, -1, diff --git a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp index 45e19cdea300b..c18892ac62f24 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVCallLowering.cpp @@ -34,14 +34,15 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { // Whether this is assigning args for a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. 
- bool AssignedFirstMaskArg = false; + RVVArgDispatcher &RVVDispatcher; public: RISCVOutgoingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet, + RVVArgDispatcher &RVVDispatcher) : CallLowering::OutgoingValueAssigner(nullptr), - RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} + RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet), + RVVDispatcher(RVVDispatcher) {} bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -51,16 +52,9 @@ struct RISCVOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { const DataLayout &DL = MF.getDataLayout(); const RISCVSubtarget &Subtarget = MF.getSubtarget(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, Info.IsFixed, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering(), RVVDispatcher)) return true; StackSize = State.getStackSize(); @@ -181,14 +175,15 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { // Whether this is assigning args from a return. bool IsRet; - // true if assignArg has been called for a mask argument, false otherwise. - bool AssignedFirstMaskArg = false; + RVVArgDispatcher &RVVDispatcher; public: RISCVIncomingValueAssigner( - RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet) + RISCVTargetLowering::RISCVCCAssignFn *RISCVAssignFn_, bool IsRet, + RVVArgDispatcher &RVVDispatcher) : CallLowering::IncomingValueAssigner(nullptr), - RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet) {} + RISCVAssignFn(RISCVAssignFn_), IsRet(IsRet), + RVVDispatcher(RVVDispatcher) {} bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -201,16 +196,9 @@ struct RISCVIncomingValueAssigner : public CallLowering::IncomingValueAssigner { if (LocVT.isScalableVector()) MF.getInfo()->setIsVectorCall(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions() && !AssignedFirstMaskArg && - ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) { - FirstMaskArgument = ValNo; - AssignedFirstMaskArg = true; - } - if (RISCVAssignFn(DL, Subtarget.getTargetABI(), ValNo, ValVT, LocVT, LocInfo, Flags, State, /*IsFixed=*/true, IsRet, Info.Ty, - *Subtarget.getTargetLowering(), FirstMaskArgument)) + *Subtarget.getTargetLowering(), RVVDispatcher)) return true; StackSize = State.getStackSize(); @@ -420,9 +408,11 @@ bool RISCVCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, SmallVector SplitRetInfos; splitToValueTypes(OrigRetInfo, SplitRetInfos, DL, CC); + RVVArgDispatcher Dispatcher{&MF, getTLI(), + ArrayRef(F.getReturnType())}; RISCVOutgoingValueAssigner Assigner( CC == CallingConv::Fast ? 
RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/true); + /*IsRet=*/true, Dispatcher); RISCVOutgoingValueHandler Handler(MIRBuilder, MF.getRegInfo(), Ret); return determineAndHandleAssignments(Handler, Assigner, SplitRetInfos, MIRBuilder, CC, F.isVarArg()); @@ -531,6 +521,7 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CallingConv::ID CC = F.getCallingConv(); SmallVector SplitArgInfos; + SmallVector TypeList; unsigned Index = 0; for (auto &Arg : F.args()) { // Construct the ArgInfo object from destination register and argument type. @@ -542,12 +533,16 @@ bool RISCVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, // correspondingly and appended to SplitArgInfos. splitToValueTypes(AInfo, SplitArgInfos, DL, CC); + TypeList.push_back(Arg.getType()); + ++Index; } + RVVArgDispatcher Dispatcher{&MF, getTLI(), + ArrayRef(TypeList)}; RISCVIncomingValueAssigner Assigner( CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/false); + /*IsRet=*/false, Dispatcher); RISCVFormalArgHandler Handler(MIRBuilder, MF.getRegInfo()); SmallVector ArgLocs; @@ -585,11 +580,13 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector SplitArgInfos; SmallVector Outs; + SmallVector TypeList; for (auto &AInfo : Info.OrigArgs) { // Handle any required unmerging of split value types from a given VReg into // physical registers. ArgInfo objects are constructed correspondingly and // appended to SplitArgInfos. splitToValueTypes(AInfo, SplitArgInfos, DL, CC); + TypeList.push_back(AInfo.Ty); } // TODO: Support tail calls. @@ -607,9 +604,11 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); Call.addRegMask(TRI->getCallPreservedMask(MF, Info.CallConv)); + RVVArgDispatcher ArgDispatcher{&MF, getTLI(), + ArrayRef(TypeList)}; RISCVOutgoingValueAssigner ArgAssigner( CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/false); + /*IsRet=*/false, ArgDispatcher); RISCVOutgoingValueHandler ArgHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(ArgHandler, ArgAssigner, SplitArgInfos, MIRBuilder, CC, Info.IsVarArg)) @@ -637,9 +636,11 @@ bool RISCVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, SmallVector SplitRetInfos; splitToValueTypes(Info.OrigRet, SplitRetInfos, DL, CC); + RVVArgDispatcher RetDispatcher{&MF, getTLI(), + ArrayRef(F.getReturnType())}; RISCVIncomingValueAssigner RetAssigner( CC == CallingConv::Fast ? RISCV::CC_RISCV_FastCC : RISCV::CC_RISCV, - /*IsRet=*/true); + /*IsRet=*/true, RetDispatcher); RISCVCallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Call); if (!determineAndHandleAssignments(RetHandler, RetAssigner, SplitRetInfos, MIRBuilder, CC, Info.IsVarArg)) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp index 86e44343b5086..cc534f29685f2 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVRegisterBankInfo.cpp @@ -154,46 +154,6 @@ static const RegisterBankInfo::ValueMapping *getFPValueMapping(unsigned Size) { return &RISCV::ValueMappings[Idx]; } -/// Returns whether opcode \p Opc is a pre-isel generic floating-point opcode, -/// having only floating-point operands. -/// FIXME: this is copied from target AArch64. Needs some code refactor here to -/// put this function in GlobalISel/Utils.cpp. 
-static bool isPreISelGenericFloatingPointOpcode(unsigned Opc) { - switch (Opc) { - case TargetOpcode::G_FADD: - case TargetOpcode::G_FSUB: - case TargetOpcode::G_FMUL: - case TargetOpcode::G_FMA: - case TargetOpcode::G_FDIV: - case TargetOpcode::G_FCONSTANT: - case TargetOpcode::G_FPEXT: - case TargetOpcode::G_FPTRUNC: - case TargetOpcode::G_FCEIL: - case TargetOpcode::G_FFLOOR: - case TargetOpcode::G_FNEARBYINT: - case TargetOpcode::G_FNEG: - case TargetOpcode::G_FCOPYSIGN: - case TargetOpcode::G_FCOS: - case TargetOpcode::G_FSIN: - case TargetOpcode::G_FLOG10: - case TargetOpcode::G_FLOG: - case TargetOpcode::G_FLOG2: - case TargetOpcode::G_FSQRT: - case TargetOpcode::G_FABS: - case TargetOpcode::G_FEXP: - case TargetOpcode::G_FRINT: - case TargetOpcode::G_INTRINSIC_TRUNC: - case TargetOpcode::G_INTRINSIC_ROUND: - case TargetOpcode::G_INTRINSIC_ROUNDEVEN: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMAXIMUM: - case TargetOpcode::G_FMINIMUM: - return true; - } - return false; -} - // TODO: Make this more like AArch64? bool RISCVRegisterBankInfo::hasFPConstraints( const MachineInstr &MI, const MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 173995f05b51c..d93709ac03420 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -326,8 +326,8 @@ bool RISCVExpandPseudo::expandRV32ZdinxStore(MachineBasicBlock &MBB, .setMemRefs(MMOLo); if (MBBI->getOperand(2).isGlobal() || MBBI->getOperand(2).isCPI()) { - // FIXME: Zdinx RV32 can not work on unaligned memory. - assert(!STI->hasFastUnalignedAccess()); + // FIXME: Zdinx RV32 can not work on unaligned scalar memory. 
+ assert(!STI->enableUnalignedScalarMem()); assert(MBBI->getOperand(2).getOffset() % 8 == 0); MBBI->getOperand(2).setOffset(MBBI->getOperand(2).getOffset() + 4); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 794455aa73040..f830ead5dd692 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -208,6 +208,13 @@ def HasStdExtAOrZalrsc "'A' (Atomic Instructions) or " "'Zalrsc' (Load-Reserved/Store-Conditional)">; +def FeatureStdExtZama16b + : SubtargetFeature<"zama16b", "HasStdExtZama16b", "true", + "'Zama16b' (Atomic 16-byte misaligned loads, stores and AMOs)">; +def HasStdExtZama16b : Predicate<"Subtarget->hasStdExtZama16b()">, + AssemblerPredicate<(all_of FeatureStdExtZama16b), + "'Zama16b' (Atomic 16-byte misaligned loads, stores and AMOs)">; + def FeatureStdExtZawrs : SubtargetFeature<"zawrs", "HasStdExtZawrs", "true", "'Zawrs' (Wait on Reservation Set)">; def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, @@ -1183,10 +1190,15 @@ def FeatureTrailingSeqCstFence : SubtargetFeature<"seq-cst-trailing-fence", "true", "Enable trailing fence for seq-cst store.">; -def FeatureFastUnalignedAccess - : SubtargetFeature<"fast-unaligned-access", "HasFastUnalignedAccess", - "true", "Has reasonably performant unaligned " - "loads and stores (both scalar and vector)">; +def FeatureUnalignedScalarMem + : SubtargetFeature<"unaligned-scalar-mem", "EnableUnalignedScalarMem", + "true", "Has reasonably performant unaligned scalar " + "loads and stores">; + +def FeatureUnalignedVectorMem + : SubtargetFeature<"unaligned-vector-mem", "EnableUnalignedVectorMem", + "true", "Has reasonably performant unaligned vector " + "loads and stores">; def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler", "UsePostRAScheduler", "true", "Schedule again after register allocation">; @@ -1226,9 +1238,9 @@ def TuneNoSinkSplatOperands "false", "Disable sink splat operands to enable .vx, .vf," ".wx, and .wf instructions">; -def TuneNoStripWSuffix - : SubtargetFeature<"no-strip-w-suffix", "EnableStripWSuffix", "false", - "Disable strip W suffix">; +def TunePreferWInst + : SubtargetFeature<"prefer-w-inst", "PreferWInst", "true", + "Prefer instructions with W suffix">; def TuneConditionalCompressedMoveFusion : SubtargetFeature<"conditional-cmv-fusion", "HasConditionalCompressedMoveFusion", diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 71672ed7b4ae7..cb41577c5d943 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -435,6 +435,33 @@ void RISCVFrameLowering::adjustStackForRVV(MachineFunction &MF, Flag, getStackAlign()); } +static void appendScalableVectorExpression(const TargetRegisterInfo &TRI, + SmallVectorImpl &Expr, + int FixedOffset, int ScalableOffset, + llvm::raw_string_ostream &Comment) { + unsigned DwarfVLenB = TRI.getDwarfRegNum(RISCV::VLENB, true); + uint8_t Buffer[16]; + if (FixedOffset) { + Expr.push_back(dwarf::DW_OP_consts); + Expr.append(Buffer, Buffer + encodeSLEB128(FixedOffset, Buffer)); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + Comment << (FixedOffset < 0 ? 
" - " : " + ") << std::abs(FixedOffset); + } + + Expr.push_back((uint8_t)dwarf::DW_OP_consts); + Expr.append(Buffer, Buffer + encodeSLEB128(ScalableOffset, Buffer)); + + Expr.push_back((uint8_t)dwarf::DW_OP_bregx); + Expr.append(Buffer, Buffer + encodeULEB128(DwarfVLenB, Buffer)); + Expr.push_back(0); + + Expr.push_back((uint8_t)dwarf::DW_OP_mul); + Expr.push_back((uint8_t)dwarf::DW_OP_plus); + + Comment << (ScalableOffset < 0 ? " - " : " + ") << std::abs(ScalableOffset) + << " * vlenb"; +} + static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, Register Reg, uint64_t FixedOffset, @@ -452,30 +479,38 @@ static MCCFIInstruction createDefCFAExpression(const TargetRegisterInfo &TRI, else Comment << printReg(Reg, &TRI); - uint8_t buffer[16]; - if (FixedOffset) { - Expr.push_back(dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(FixedOffset, buffer)); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); - Comment << " + " << FixedOffset; - } + appendScalableVectorExpression(TRI, Expr, FixedOffset, ScalableOffset, + Comment); - Expr.push_back((uint8_t)dwarf::DW_OP_consts); - Expr.append(buffer, buffer + encodeSLEB128(ScalableOffset, buffer)); + SmallString<64> DefCfaExpr; + uint8_t Buffer[16]; + DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); + DefCfaExpr.append(Buffer, Buffer + encodeULEB128(Expr.size(), Buffer)); + DefCfaExpr.append(Expr.str()); - unsigned DwarfVlenb = TRI.getDwarfRegNum(RISCV::VLENB, true); - Expr.push_back((uint8_t)dwarf::DW_OP_bregx); - Expr.append(buffer, buffer + encodeULEB128(DwarfVlenb, buffer)); - Expr.push_back(0); + return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), + Comment.str()); +} - Expr.push_back((uint8_t)dwarf::DW_OP_mul); - Expr.push_back((uint8_t)dwarf::DW_OP_plus); +static MCCFIInstruction createDefCFAOffset(const TargetRegisterInfo &TRI, + Register Reg, uint64_t FixedOffset, + uint64_t ScalableOffset) { + assert(ScalableOffset != 0 && "Did not need to adjust CFA for RVV"); + SmallString<64> Expr; + std::string CommentBuffer; + llvm::raw_string_ostream Comment(CommentBuffer); + Comment << printReg(Reg, &TRI) << " @ cfa"; - Comment << " + " << ScalableOffset << " * vlenb"; + // Build up the expression (FixedOffset + ScalableOffset * VLENB). 
+ appendScalableVectorExpression(TRI, Expr, FixedOffset, ScalableOffset, + Comment); SmallString<64> DefCfaExpr; - DefCfaExpr.push_back(dwarf::DW_CFA_def_cfa_expression); - DefCfaExpr.append(buffer, buffer + encodeULEB128(Expr.size(), buffer)); + uint8_t Buffer[16]; + unsigned DwarfReg = TRI.getDwarfRegNum(Reg, true); + DefCfaExpr.push_back(dwarf::DW_CFA_expression); + DefCfaExpr.append(Buffer, Buffer + encodeULEB128(DwarfReg, Buffer)); + DefCfaExpr.append(Buffer, Buffer + encodeULEB128(Expr.size(), Buffer)); DefCfaExpr.append(Expr.str()); return MCCFIInstruction::createEscape(nullptr, DefCfaExpr.str(), SMLoc(), @@ -671,6 +706,9 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF, .addCFIIndex(CFIIndex) .setMIFlag(MachineInstr::FrameSetup); } + + std::advance(MBBI, getRVVCalleeSavedInfo(MF, CSI).size()); + emitCalleeSavedRVVPrologCFI(MBB, MBBI, hasFP(MF)); } if (hasFP(MF)) { @@ -1492,6 +1530,41 @@ bool RISCVFrameLowering::spillCalleeSavedRegisters( return true; } +void RISCVFrameLowering::emitCalleeSavedRVVPrologCFI( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, bool HasFP) const { + MachineFunction *MF = MBB.getParent(); + const MachineFrameInfo &MFI = MF->getFrameInfo(); + RISCVMachineFunctionInfo *RVFI = MF->getInfo(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + DebugLoc DL = MBB.findDebugLoc(MI); + + const auto &RVVCSI = getRVVCalleeSavedInfo(*MF, MFI.getCalleeSavedInfo()); + if (RVVCSI.empty()) + return; + + uint64_t FixedSize = getStackSizeWithRVVPadding(*MF); + if (!HasFP) { + uint64_t ScalarLocalVarSize = + MFI.getStackSize() - RVFI->getCalleeSavedStackSize() - + RVFI->getRVPushStackSize() - RVFI->getVarArgsSaveSize() + + RVFI->getRVVPadding(); + FixedSize -= ScalarLocalVarSize; + } + + for (auto &CS : RVVCSI) { + // Insert the spill to the stack frame. 
+ int FI = CS.getFrameIdx(); + if (FI >= 0 && MFI.getStackID(FI) == TargetStackID::ScalableVector) { + unsigned CFIIndex = MF->addFrameInst( + createDefCFAOffset(*STI.getRegisterInfo(), CS.getReg(), -FixedSize, + MFI.getObjectOffset(FI) / 8)); + BuildMI(MBB, MI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlag(MachineInstr::FrameSetup); + } + } +} + bool RISCVFrameLowering::restoreCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, MutableArrayRef CSI, const TargetRegisterInfo *TRI) const { diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index 210f8c1064724..28ab4aff3b9d5 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -88,6 +88,9 @@ class RISCVFrameLowering : public TargetFrameLowering { void adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, int64_t Amount, MachineInstr::MIFlag Flag) const; + void emitCalleeSavedRVVPrologCFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + bool HasFP) const; std::pair assignRVVStackObjectOffsets(MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 27387595164a4..b0deb1d266995 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -1484,6 +1485,11 @@ bool RISCVTargetLowering::shouldExpandGetVectorLength(EVT TripCountVT, return VF > MaxVF || !isPowerOf2_32(VF); } +bool RISCVTargetLowering::shouldExpandCttzElements(EVT VT) const { + return !Subtarget.hasVInstructions() || + VT.getVectorElementType() != MVT::i1 || !isTypeLegal(VT); +} + bool RISCVTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, @@ -1918,7 +1924,7 @@ bool RISCVTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, // replace. If we don't support unaligned scalar mem, prefer the constant // pool. // TODO: Can the caller pass down the alignment? - if (!Subtarget.hasFastUnalignedAccess()) + if (!Subtarget.enableUnalignedScalarMem()) return true; // Prefer to keep the load if it would require many instructions. @@ -8718,6 +8724,29 @@ static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res); } +static SDValue lowerCttzElts(SDNode *N, SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDValue Op0 = N->getOperand(1); + MVT OpVT = Op0.getSimpleValueType(); + MVT ContainerVT = OpVT; + if (OpVT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(DAG, OpVT, Subtarget); + Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget); + } + MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(N); + auto [Mask, VL] = getDefaultVLOps(OpVT, ContainerVT, DL, DAG, Subtarget); + SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Op0, Mask, VL); + if (isOneConstant(N->getOperand(2))) + return Res; + + // Convert -1 to VL. 
+ SDValue Setcc = + DAG.getSetCC(DL, XLenVT, Res, DAG.getConstant(0, DL, XLenVT), ISD::SETLT); + VL = DAG.getElementCount(DL, XLenVT, OpVT.getVectorElementCount()); + return DAG.getSelect(DL, XLenVT, Setcc, VL, Res); +} + static inline void promoteVCIXScalar(const SDValue &Op, SmallVectorImpl &Operands, SelectionDAG &DAG) { @@ -8913,6 +8942,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::experimental_get_vector_length: return lowerGetVectorLength(Op.getNode(), DAG, Subtarget); + case Intrinsic::experimental_cttz_elts: + return lowerCttzElts(Op.getNode(), DAG, Subtarget); case Intrinsic::riscv_vmv_x_s: { SDValue Res = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Op.getOperand(1)); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res); @@ -10403,14 +10434,10 @@ RISCVTargetLowering::lowerFixedLengthVectorLoadToRVV(SDValue Op, if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Load->getMemOperand(); - MachineFunction &MF = DAG.getMachineFunction(); - MMO = MF.getMachineMemOperand( - MMO, MMO->getPointerInfo(), - MMO->getMemoryType().isValid() - ? LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits()) - : MMO->getMemoryType()); SDValue NewLoad = - DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), MMO); + DAG.getLoad(ContainerVT, DL, Load->getChain(), Load->getBasePtr(), + MMO->getPointerInfo(), MMO->getBaseAlign(), MMO->getFlags(), + MMO->getAAInfo(), MMO->getRanges()); SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget); return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL); } @@ -10470,14 +10497,9 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, if (MinVLMAX == MaxVLMAX && MinVLMAX == VT.getVectorNumElements() && getLMUL1VT(ContainerVT).bitsLE(ContainerVT)) { MachineMemOperand *MMO = Store->getMemOperand(); - MachineFunction &MF = DAG.getMachineFunction(); - MMO = MF.getMachineMemOperand( - MMO, MMO->getPointerInfo(), - MMO->getMemoryType().isValid() - ? LLT::scalable_vector(1, MMO->getMemoryType().getSizeInBits()) - : MMO->getMemoryType()); return DAG.getStore(Store->getChain(), DL, NewValue, Store->getBasePtr(), - MMO); + MMO->getPointerInfo(), MMO->getBaseAlign(), + MMO->getFlags(), MMO->getAAInfo()); } SDValue VL = getVLOp(VT.getVectorNumElements(), ContainerVT, DL, DAG, @@ -12336,6 +12358,12 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res)); return; } + case Intrinsic::experimental_cttz_elts: { + SDValue Res = lowerCttzElts(N, DAG, Subtarget); + Results.push_back( + DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res)); + return; + } case Intrinsic::riscv_orc_b: case Intrinsic::riscv_brev8: case Intrinsic::riscv_sha256sig0: @@ -13363,10 +13391,99 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG, return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget); } -static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG) { +// Try to expand a scalar multiply to a faster sequence. +static SDValue expandMul(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + EVT VT = N->getValueType(0); - if (!VT.isVector()) + + // LI + MUL is usually smaller than the alternative sequence. 
+ if (DAG.getMachineFunction().getFunction().hasMinSize()) + return SDValue(); + + if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) + return SDValue(); + + if (VT != Subtarget.getXLenVT()) + return SDValue(); + + if (!Subtarget.hasStdExtZba() && !Subtarget.hasVendorXTHeadBa()) + return SDValue(); + + ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!CNode) + return SDValue(); + uint64_t MulAmt = CNode->getZExtValue(); + + // 3/5/9 * 2^N -> shXadd (sll X, C), (sll X, C) + // Matched in tablegen, avoid perturbing patterns. + for (uint64_t Divisor : {3, 5, 9}) + if (MulAmt % Divisor == 0 && isPowerOf2_64(MulAmt / Divisor)) + return SDValue(); + + // If this is a power of 2 + 2/4/8, we can use a shift followed by a single + // shXadd. First check if this is a sum of two powers of 2 because that's + // easy. Then count the trailing zeros up to the first set bit. + if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { + unsigned ScaleShift = llvm::countr_zero(MulAmt); + if (ScaleShift >= 1 && ScaleShift < 4) { + unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); + SDLoc DL(N); + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ShiftAmt, DL, VT)); + SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ScaleShift, DL, VT)); + return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); + } + } + + // 2^(1,2,3) * 3,5,9 + 1 -> (shXadd (shYadd x, x), x) + // Matched in tablegen, avoid perturbing patterns. + switch (MulAmt) { + case 11: + case 13: + case 19: + case 21: + case 25: + case 27: + case 29: + case 37: + case 41: + case 45: + case 73: + case 91: return SDValue(); + default: + break; + } + + // 2^n + 2/4/8 + 1 -> (add (shl X, C1), (shXadd X, X)) + if (MulAmt > 2 && isPowerOf2_64((MulAmt - 1) & (MulAmt - 2))) { + unsigned ScaleShift = llvm::countr_zero(MulAmt - 1); + if (ScaleShift >= 1 && ScaleShift < 4) { + unsigned ShiftAmt = Log2_64(((MulAmt - 1) & (MulAmt - 2))); + SDLoc DL(N); + SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ShiftAmt, DL, VT)); + SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), + DAG.getConstant(ScaleShift, DL, VT)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(ISD::ADD, DL, VT, Shift2, N->getOperand(0))); + } + } + + return SDValue(); +} + + +static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const RISCVSubtarget &Subtarget) { + EVT VT = N->getValueType(0); + if (!VT.isVector()) + return expandMul(N, DAG, DCI, Subtarget); SDLoc DL(N); SDValue N0 = N->getOperand(0); @@ -15720,7 +15837,7 @@ static bool matchIndexAsWiderOp(EVT VT, SDValue Index, SDValue Mask, if (WiderElementSize > ST.getELen()/8) return false; - if (!ST.hasFastUnalignedAccess() && BaseAlign < WiderElementSize) + if (!ST.enableUnalignedVectorMem() && BaseAlign < WiderElementSize) return false; for (unsigned i = 0; i < Index->getNumOperands(); i++) { @@ -15913,7 +16030,7 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, case ISD::MUL: if (SDValue V = combineBinOp_VLToVWBinOp_VL(N, DCI, Subtarget)) return V; - return performMULCombine(N, DAG); + return performMULCombine(N, DAG, DCI, Subtarget); case ISD::SDIV: case ISD::UDIV: case ISD::SREM: @@ -18150,33 +18267,12 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1, return false; } -static unsigned allocateRVVReg(MVT ValVT, unsigned ValNo, - std::optional<unsigned> FirstMaskArgument, - CCState &State, const RISCVTargetLowering &TLI) { - const
TargetRegisterClass *RC = TLI.getRegClassFor(ValVT); - if (RC == &RISCV::VRRegClass) { - // Assign the first mask argument to V0. - // This is an interim calling convention and it may be changed in the - // future. - if (FirstMaskArgument && ValNo == *FirstMaskArgument) - return State.AllocateReg(RISCV::V0); - return State.AllocateReg(ArgVRs); - } - if (RC == &RISCV::VRM2RegClass) - return State.AllocateReg(ArgVRM2s); - if (RC == &RISCV::VRM4RegClass) - return State.AllocateReg(ArgVRM4s); - if (RC == &RISCV::VRM8RegClass) - return State.AllocateReg(ArgVRM8s); - llvm_unreachable("Unhandled register class for ValueType"); -} - // Implements the RISC-V calling convention. Returns true upon failure. bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + RVVArgDispatcher &RVVDispatcher) { unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); assert(XLen == 32 || XLen == 64); MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; @@ -18345,7 +18441,7 @@ bool RISCV::CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, else if (ValVT == MVT::f64 && !UseGPRForF64) Reg = State.AllocateReg(ArgFPR64s); else if (ValVT.isVector()) { - Reg = allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI); + Reg = RVVDispatcher.getNextPhysReg(); if (!Reg) { // For return values, the vector must be passed fully via registers or // via the stack. @@ -18431,9 +18527,15 @@ void RISCVTargetLowering::analyzeInputArgs( unsigned NumArgs = Ins.size(); FunctionType *FType = MF.getFunction().getFunctionType(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Ins); + RVVArgDispatcher Dispatcher; + if (IsRet) { + Dispatcher = RVVArgDispatcher{&MF, this, ArrayRef(Ins)}; + } else { + SmallVector TypeList; + for (const Argument &Arg : MF.getFunction().args()) + TypeList.push_back(Arg.getType()); + Dispatcher = RVVArgDispatcher{&MF, this, ArrayRef(TypeList)}; + } for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Ins[i].VT; @@ -18448,7 +18550,7 @@ void RISCVTargetLowering::analyzeInputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy, *this, - FirstMaskArgument)) { + Dispatcher)) { LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << ArgVT << '\n'); llvm_unreachable(nullptr); @@ -18462,9 +18564,13 @@ void RISCVTargetLowering::analyzeOutputArgs( CallLoweringInfo *CLI, RISCVCCAssignFn Fn) const { unsigned NumArgs = Outs.size(); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); + SmallVector TypeList; + if (IsRet) + TypeList.push_back(MF.getFunction().getReturnType()); + else if (CLI) + for (const TargetLowering::ArgListEntry &Arg : CLI->getArgs()) + TypeList.push_back(Arg.Ty); + RVVArgDispatcher Dispatcher{&MF, this, ArrayRef(TypeList)}; for (unsigned i = 0; i != NumArgs; i++) { MVT ArgVT = Outs[i].VT; @@ -18474,7 +18580,7 @@ void RISCVTargetLowering::analyzeOutputArgs( RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (Fn(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy, *this, - FirstMaskArgument)) { + Dispatcher)) { LLVM_DEBUG(dbgs() << "OutputArg #" << i << " has 
unhandled type " << ArgVT << "\n"); llvm_unreachable(nullptr); @@ -18655,7 +18761,7 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument) { + RVVArgDispatcher &RVVDispatcher) { if (LocVT == MVT::i32 || LocVT == MVT::i64) { if (unsigned Reg = State.AllocateReg(getFastCCArgGPRs(ABI))) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); @@ -18733,13 +18839,14 @@ bool RISCV::CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, } if (LocVT.isVector()) { - if (unsigned Reg = - allocateRVVReg(ValVT, ValNo, FirstMaskArgument, State, TLI)) { + MCPhysReg AllocatedVReg = RVVDispatcher.getNextPhysReg(); + if (AllocatedVReg) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. if (ValVT.isFixedLengthVector()) LocVT = TLI.getContainerForFixedLengthVector(LocVT); - State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + State.addLoc( + CCValAssign::getReg(ValNo, ValVT, AllocatedVReg, LocVT, LocInfo)); } else { // Try and pass the address via a "fast" GPR. if (unsigned GPRReg = State.AllocateReg(getFastCCArgGPRs(ABI))) { @@ -19367,17 +19474,15 @@ bool RISCVTargetLowering::CanLowerReturn( SmallVector RVLocs; CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context); - std::optional FirstMaskArgument; - if (Subtarget.hasVInstructions()) - FirstMaskArgument = preAssignMask(Outs); + RVVArgDispatcher Dispatcher{&MF, this, ArrayRef(Outs)}; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; RISCVABI::ABI ABI = MF.getSubtarget().getTargetABI(); if (RISCV::CC_RISCV(MF.getDataLayout(), ABI, i, VT, VT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr, - *this, FirstMaskArgument)) + ArgFlags, CCInfo, /*IsFixed=*/true, /*IsRet=*/true, + nullptr, *this, Dispatcher)) return false; } return true; @@ -20558,8 +20663,8 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses( unsigned *Fast) const { if (!VT.isVector()) { if (Fast) - *Fast = Subtarget.hasFastUnalignedAccess(); - return Subtarget.hasFastUnalignedAccess(); + *Fast = Subtarget.enableUnalignedScalarMem(); + return Subtarget.enableUnalignedScalarMem(); } // All vector implementations must support element alignment @@ -20575,8 +20680,8 @@ bool RISCVTargetLowering::allowsMisalignedMemoryAccesses( // misaligned accesses. TODO: Work through the codegen implications of // allowing such accesses to be formed, and considered fast. if (Fast) - *Fast = Subtarget.hasFastUnalignedAccess(); - return Subtarget.hasFastUnalignedAccess(); + *Fast = Subtarget.enableUnalignedVectorMem(); + return Subtarget.enableUnalignedVectorMem(); } @@ -20611,7 +20716,7 @@ EVT RISCVTargetLowering::getOptimalMemOpType(const MemOp &Op, // Do we have sufficient alignment for our preferred VT? If not, revert // to largest size allowed by our alignment criteria. 
- if (PreferredVT != MVT::i8 && !Subtarget.hasFastUnalignedAccess()) { + if (PreferredVT != MVT::i8 && !Subtarget.enableUnalignedVectorMem()) { Align RequiredAlign(PreferredVT.getStoreSize()); if (Op.isFixedDstAlign()) RequiredAlign = std::min(RequiredAlign, Op.getDstAlign()); @@ -20803,7 +20908,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, if (!isLegalElementTypeForRVV(ScalarType)) return false; - if (!Subtarget.hasFastUnalignedAccess() && + if (!Subtarget.enableUnalignedVectorMem() && Alignment < ScalarType.getStoreSize()) return false; @@ -21174,6 +21279,181 @@ unsigned RISCVTargetLowering::getMinimumJumpTableEntries() const { return Subtarget.getMinimumJumpTableEntries(); } +// Handle single arg such as return value. +template +void RVVArgDispatcher::constructArgInfos(ArrayRef ArgList) { + // This lambda determines whether an array of types are constructed by + // homogeneous vector types. + auto isHomogeneousScalableVectorType = [](ArrayRef ArgList) { + // First, extract the first element in the argument type. + auto It = ArgList.begin(); + MVT FirstArgRegType = It->VT; + + // Return if there is no return or the type needs split. + if (It == ArgList.end() || It->Flags.isSplit()) + return false; + + ++It; + + // Return if this argument type contains only 1 element, or it's not a + // vector type. + if (It == ArgList.end() || !FirstArgRegType.isScalableVector()) + return false; + + // Second, check if the following elements in this argument type are all the + // same. + for (; It != ArgList.end(); ++It) + if (It->Flags.isSplit() || It->VT != FirstArgRegType) + return false; + + return true; + }; + + if (isHomogeneousScalableVectorType(ArgList)) { + // Handle as tuple type + RVVArgInfos.push_back({(unsigned)ArgList.size(), ArgList[0].VT, false}); + } else { + // Handle as normal vector type + bool FirstVMaskAssigned = false; + for (const auto &OutArg : ArgList) { + MVT RegisterVT = OutArg.VT; + + // Skip non-RVV register type + if (!RegisterVT.isVector()) + continue; + + if (RegisterVT.isFixedLengthVector()) + RegisterVT = TLI->getContainerForFixedLengthVector(RegisterVT); + + if (!FirstVMaskAssigned && RegisterVT.getVectorElementType() == MVT::i1) { + RVVArgInfos.push_back({1, RegisterVT, true}); + FirstVMaskAssigned = true; + continue; + } + + RVVArgInfos.push_back({1, RegisterVT, false}); + } + } +} + +// Handle multiple args. 
+template <> +void RVVArgDispatcher::constructArgInfos(ArrayRef<Type *> TypeList) { + const DataLayout &DL = MF->getDataLayout(); + const Function &F = MF->getFunction(); + LLVMContext &Context = F.getContext(); + + bool FirstVMaskAssigned = false; + for (Type *Ty : TypeList) { + StructType *STy = dyn_cast<StructType>(Ty); + if (STy && STy->containsHomogeneousScalableVectorTypes()) { + Type *ElemTy = STy->getTypeAtIndex(0U); + EVT VT = TLI->getValueType(DL, ElemTy); + MVT RegisterVT = + TLI->getRegisterTypeForCallingConv(Context, F.getCallingConv(), VT); + unsigned NumRegs = + TLI->getNumRegistersForCallingConv(Context, F.getCallingConv(), VT); + + RVVArgInfos.push_back( + {NumRegs * STy->getNumElements(), RegisterVT, false}); + } else { + SmallVector<EVT, 4> ValueVTs; + ComputeValueVTs(*TLI, DL, Ty, ValueVTs); + + for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues; + ++Value) { + EVT VT = ValueVTs[Value]; + MVT RegisterVT = + TLI->getRegisterTypeForCallingConv(Context, F.getCallingConv(), VT); + unsigned NumRegs = + TLI->getNumRegistersForCallingConv(Context, F.getCallingConv(), VT); + + // Skip non-RVV register type + if (!RegisterVT.isVector()) + continue; + + if (RegisterVT.isFixedLengthVector()) + RegisterVT = TLI->getContainerForFixedLengthVector(RegisterVT); + + if (!FirstVMaskAssigned && + RegisterVT.getVectorElementType() == MVT::i1) { + RVVArgInfos.push_back({1, RegisterVT, true}); + FirstVMaskAssigned = true; + --NumRegs; + } + + RVVArgInfos.insert(RVVArgInfos.end(), NumRegs, {1, RegisterVT, false}); + } + } + } +} + +void RVVArgDispatcher::allocatePhysReg(unsigned NF, unsigned LMul, + unsigned StartReg) { + assert((StartReg % LMul) == 0 && + "Start register number should be multiple of lmul"); + const MCPhysReg *VRArrays; + switch (LMul) { + default: + report_fatal_error("Invalid lmul"); + case 1: + VRArrays = ArgVRs; + break; + case 2: + VRArrays = ArgVRM2s; + break; + case 4: + VRArrays = ArgVRM4s; + break; + case 8: + VRArrays = ArgVRM8s; + break; + } + + for (unsigned i = 0; i < NF; ++i) + if (StartReg) + AllocatedPhysRegs.push_back(VRArrays[(StartReg - 8) / LMul + i]); + else + AllocatedPhysRegs.push_back(MCPhysReg()); +} + +/// This function determines if each RVV argument is passed by register. If +/// the argument can be assigned to a VR, give it a specific register; +/// otherwise, assign it 0, which is an invalid MCPhysReg. +void RVVArgDispatcher::compute() { + uint32_t AssignedMap = 0; + auto allocate = [&](const RVVArgInfo &ArgInfo) { + // Allocate first vector mask argument to V0.
+ if (ArgInfo.FirstVMask) { + AllocatedPhysRegs.push_back(RISCV::V0); + return; + } + + unsigned RegsNeeded = divideCeil( + ArgInfo.VT.getSizeInBits().getKnownMinValue(), RISCV::RVVBitsPerBlock); + unsigned TotalRegsNeeded = ArgInfo.NF * RegsNeeded; + for (unsigned StartReg = 0; StartReg + TotalRegsNeeded <= NumArgVRs; + StartReg += RegsNeeded) { + uint32_t Map = ((1 << TotalRegsNeeded) - 1) << StartReg; + if ((AssignedMap & Map) == 0) { + allocatePhysReg(ArgInfo.NF, RegsNeeded, StartReg + 8); + AssignedMap |= Map; + return; + } + } + + allocatePhysReg(ArgInfo.NF, RegsNeeded, 0); + }; + + for (unsigned i = 0; i < RVVArgInfos.size(); ++i) + allocate(RVVArgInfos[i]); +} + +MCPhysReg RVVArgDispatcher::getNextPhysReg() { + assert(CurIdx < AllocatedPhysRegs.size() && "Index out of range"); + return AllocatedPhysRegs[CurIdx++]; +} + namespace llvm::RISCVVIntrinsicsTable { #define GET_RISCVVIntrinsicsTable_IMPL diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index ace5b3fd2b95b..b10da3d40befb 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -24,6 +24,7 @@ namespace llvm { class InstructionCost; class RISCVSubtarget; struct RISCVRegisterInfo; +class RVVArgDispatcher; namespace RISCVISD { // clang-format off @@ -875,7 +876,7 @@ class RISCVTargetLowering : public TargetLowering { ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional<unsigned> FirstMaskArgument); + RVVArgDispatcher &RVVDispatcher); private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, @@ -986,6 +987,8 @@ class RISCVTargetLowering : public TargetLowering { bool shouldExpandGetVectorLength(EVT TripCountVT, unsigned VF, bool IsScalable) const override; + bool shouldExpandCttzElements(EVT VT) const override; + /// RVV code generation for fixed length vectors does not lower all /// BUILD_VECTORs. This makes BUILD_VECTOR legalisation a source of stores to /// merge. However, merging them creates a BUILD_VECTOR that is just as @@ -1015,19 +1018,71 @@ class RISCVTargetLowering : public TargetLowering { unsigned getMinimumJumpTableEntries() const override; }; +/// As per the spec, the rules for passing vector arguments are as follows: +/// +/// 1. For the first vector mask argument, use v0 to pass it. +/// 2. For vector data arguments or the remaining vector mask arguments, starting +/// from the v8 register, if a vector register group between v8-v23 that has not been +/// allocated can be found and the first register number is a multiple of LMUL, +/// then allocate this vector register group to the argument and mark these +/// registers as allocated. Otherwise, the argument is passed by reference and is +/// replaced in the argument list with the address. +/// 3. For tuple vector data arguments, starting from the v8 register, if +/// NFIELDS consecutive vector register groups between v8-v23 that have not been +/// allocated can be found and the first register number is a multiple of LMUL, +/// then allocate these vector register groups to the argument and mark these +/// registers as allocated. Otherwise, the argument is passed by reference and is +/// replaced in the argument list with the address.
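Rules 2 and 3 above reduce to a first-fit scan over an occupancy bitmask of the sixteen argument registers v8-v23, stepping in LMUL-sized strides so the start register stays LMUL-aligned; compute() implements this with AssignedMap. A self-contained sketch of the scan (simplified to NF = 1 and a hypothetical argument sequence; not LLVM code):

#include <cstdint>
#include <cstdio>

// First-fit over the 16 argument registers v8-v23. Returns the v-register
// number, or 0 when no suitably aligned free group exists (the argument is
// then passed by reference, per rules 2 and 3).
static unsigned allocate(uint32_t &AssignedMap, unsigned RegsNeeded) {
  for (unsigned Start = 0; Start + RegsNeeded <= 16; Start += RegsNeeded) {
    uint32_t Map = ((1u << RegsNeeded) - 1) << Start; // Candidate group.
    if ((AssignedMap & Map) == 0) {
      AssignedMap |= Map;
      return 8 + Start; // v8 is the first argument register.
    }
  }
  return 0;
}

int main() {
  uint32_t Map = 0;
  std::printf("LMUL=2 -> v%u\n", allocate(Map, 2)); // v8
  std::printf("LMUL=4 -> v%u\n", allocate(Map, 4)); // v12: v8-v11 partly taken
  std::printf("LMUL=1 -> v%u\n", allocate(Map, 1)); // v10: fills the hole
}

Note how the later LMUL=1 argument lands in the v10 hole left behind by the aligned LMUL=4 allocation; the strided scan gives exactly the reuse the rules describe, rather than a simple bump allocator.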
+class RVVArgDispatcher { +public: + static constexpr unsigned NumArgVRs = 16; + + struct RVVArgInfo { + unsigned NF; + MVT VT; + bool FirstVMask = false; + }; + + template + RVVArgDispatcher(const MachineFunction *MF, const RISCVTargetLowering *TLI, + ArrayRef ArgList) + : MF(MF), TLI(TLI) { + constructArgInfos(ArgList); + compute(); + } + + RVVArgDispatcher() = default; + + MCPhysReg getNextPhysReg(); + +private: + SmallVector RVVArgInfos; + SmallVector AllocatedPhysRegs; + + const MachineFunction *MF = nullptr; + const RISCVTargetLowering *TLI = nullptr; + + unsigned CurIdx = 0; + + template void constructArgInfos(ArrayRef Ret); + void compute(); + void allocatePhysReg(unsigned NF = 1, unsigned LMul = 1, + unsigned StartReg = 0); +}; + namespace RISCV { bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + RVVArgDispatcher &RVVDispatcher); bool CC_RISCV_FastCC(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State, bool IsFixed, bool IsRet, Type *OrigTy, const RISCVTargetLowering &TLI, - std::optional FirstMaskArgument); + RVVArgDispatcher &RVVDispatcher); bool CC_RISCV_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index a14f9a2835473..fa37d1ccccd73 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -468,6 +468,7 @@ class VSETVLIInfo { bool isUnknown() const { return State == Unknown; } void setAVLReg(Register Reg) { + assert(Reg.isVirtual() || Reg == RISCV::X0 || Reg == RISCV::NoRegister); AVLReg = Reg; State = AVLIsReg; } @@ -1514,17 +1515,12 @@ static bool canMutatePriorConfig(const MachineInstr &PrevMI, // If the AVL is a register, we need to make sure MI's AVL dominates PrevMI. // For now just check that PrevMI uses the same virtual register. 
- if (AVL.isReg() && AVL.getReg() != RISCV::X0) { - if (AVL.getReg().isPhysical()) - return false; - if (!PrevAVL.isReg() || PrevAVL.getReg() != AVL.getReg()) - return false; - } + if (AVL.isReg() && AVL.getReg() != RISCV::X0 && + (!PrevAVL.isReg() || PrevAVL.getReg() != AVL.getReg())) + return false; } - if (!PrevMI.getOperand(2).isImm() || !MI.getOperand(2).isImm()) - return false; - + assert(PrevMI.getOperand(2).isImm() && MI.getOperand(2).isImm()); auto PriorVType = PrevMI.getOperand(2).getImm(); auto VType = MI.getOperand(2).getImm(); return areCompatibleVTYPEs(PriorVType, VType, Used); @@ -1542,12 +1538,15 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) { if (!isVectorConfigInstr(MI)) { doUnion(Used, getDemanded(MI, MRI, ST)); + if (MI.isCall() || MI.isInlineAsm() || MI.modifiesRegister(RISCV::VL) || + MI.modifiesRegister(RISCV::VTYPE)) + NextMI = nullptr; continue; } - Register VRegDef = MI.getOperand(0).getReg(); - if (VRegDef != RISCV::X0 && - !(VRegDef.isVirtual() && MRI->use_nodbg_empty(VRegDef))) + Register RegDef = MI.getOperand(0).getReg(); + assert(RegDef == RISCV::X0 || RegDef.isVirtual()); + if (RegDef != RISCV::X0 && !MRI->use_nodbg_empty(RegDef)) Used.demandVL(); if (NextMI) { @@ -1555,7 +1554,9 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) { ToDelete.push_back(&MI); // Leave NextMI unchanged continue; - } else if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) { + } + + if (canMutatePriorConfig(MI, *NextMI, Used, *MRI)) { if (!isVLPreservingConfig(*NextMI)) { MI.getOperand(0).setReg(NextMI->getOperand(0).getReg()); MI.getOperand(0).setIsDead(false); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b0fda040519a5..8331fc0b8c302 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -361,15 +361,12 @@ void RISCVInstrInfo::copyPhysRegVector( return {RISCVII::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V, RISCV::PseudoVMV_V_V_M1, RISCV::PseudoVMV_V_I_M1}; }; - auto FindRegWithEncoding = [&TRI](const TargetRegisterClass &RegClass, - uint16_t Encoding) { - ArrayRef Regs = RegClass.getRegisters(); - const auto *FoundReg = llvm::find_if(Regs, [&](MCPhysReg Reg) { - return TRI->getEncodingValue(Reg) == Encoding; - }); - // We should be always able to find one valid register. 
- assert(FoundReg != Regs.end()); - return *FoundReg; + auto FindRegWithEncoding = [TRI](const TargetRegisterClass &RegClass, + uint16_t Encoding) { + MCRegister Reg = RISCV::V0 + Encoding; + if (&RegClass == &RISCV::VRRegClass) + return Reg; + return TRI->getMatchingSuperReg(Reg, RISCV::sub_vrm1_0, &RegClass); }; while (I != NumRegs) { // For non-segment copying, we only do this once as the registers are always @@ -2718,6 +2715,50 @@ std::string RISCVInstrInfo::createMIROperandComment( return Comment; } +// clang-format off +#define CASE_RVV_OPCODE_UNMASK_LMUL(OP, LMUL) \ + RISCV::Pseudo##OP##_##LMUL + +#define CASE_RVV_OPCODE_MASK_LMUL(OP, LMUL) \ + RISCV::Pseudo##OP##_##LMUL##_MASK + +#define CASE_RVV_OPCODE_LMUL(OP, LMUL) \ + CASE_RVV_OPCODE_UNMASK_LMUL(OP, LMUL): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, LMUL) + +#define CASE_RVV_OPCODE_UNMASK_WIDEN(OP) \ + CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF8): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF4): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, MF2): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M1): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M2): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M4) + +#define CASE_RVV_OPCODE_UNMASK(OP) \ + CASE_RVV_OPCODE_UNMASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_UNMASK_LMUL(OP, M8) + +#define CASE_RVV_OPCODE_MASK_WIDEN(OP) \ + CASE_RVV_OPCODE_MASK_LMUL(OP, MF8): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, MF4): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, MF2): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M1): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M2): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M4) + +#define CASE_RVV_OPCODE_MASK(OP) \ + CASE_RVV_OPCODE_MASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_MASK_LMUL(OP, M8) + +#define CASE_RVV_OPCODE_WIDEN(OP) \ + CASE_RVV_OPCODE_UNMASK_WIDEN(OP): \ + case CASE_RVV_OPCODE_MASK_WIDEN(OP) + +#define CASE_RVV_OPCODE(OP) \ + CASE_RVV_OPCODE_UNMASK(OP): \ + case CASE_RVV_OPCODE_MASK(OP) +// clang-format on + // clang-format off #define CASE_VMA_OPCODE_COMMON(OP, TYPE, LMUL) \ RISCV::PseudoV##OP##_##TYPE##_##LMUL @@ -2798,6 +2839,28 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, case RISCV::PseudoCCMOVGPR: // Operands 4 and 5 are commutable. return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 4, 5); + case CASE_RVV_OPCODE(VADD_VV): + case CASE_RVV_OPCODE(VAND_VV): + case CASE_RVV_OPCODE(VOR_VV): + case CASE_RVV_OPCODE(VXOR_VV): + case CASE_RVV_OPCODE_MASK(VMSEQ_VV): + case CASE_RVV_OPCODE_MASK(VMSNE_VV): + case CASE_RVV_OPCODE(VMIN_VV): + case CASE_RVV_OPCODE(VMINU_VV): + case CASE_RVV_OPCODE(VMAX_VV): + case CASE_RVV_OPCODE(VMAXU_VV): + case CASE_RVV_OPCODE(VMUL_VV): + case CASE_RVV_OPCODE(VMULH_VV): + case CASE_RVV_OPCODE(VMULHU_VV): + case CASE_RVV_OPCODE_WIDEN(VWADD_VV): + case CASE_RVV_OPCODE_WIDEN(VWADDU_VV): + case CASE_RVV_OPCODE_WIDEN(VWMUL_VV): + case CASE_RVV_OPCODE_WIDEN(VWMULU_VV): + case CASE_RVV_OPCODE_WIDEN(VWMACC_VV): + case CASE_RVV_OPCODE_WIDEN(VWMACCU_VV): + case CASE_RVV_OPCODE_UNMASK(VADC_VVM): + // Operands 2 and 3 are commutable. 
+ return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3); case CASE_VFMA_SPLATS(FMADD): case CASE_VFMA_SPLATS(FMSUB): case CASE_VFMA_SPLATS(FMACC): @@ -2950,7 +3013,7 @@ bool RISCVInstrInfo::findCommutedOpIndices(const MachineInstr &MI, CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, TYPE, SEW) #define CASE_VFMA_CHANGE_OPCODE_VV(OLDOP, NEWOP) \ - CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E16) \ + CASE_VFMA_CHANGE_OPCODE_LMULS_MF4(OLDOP, NEWOP, VV, E16) \ CASE_VFMA_CHANGE_OPCODE_LMULS_MF2(OLDOP, NEWOP, VV, E32) \ CASE_VFMA_CHANGE_OPCODE_LMULS_M1(OLDOP, NEWOP, VV, E64) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index ad1821d57256b..435cd7f84c612 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -2127,8 +2127,9 @@ multiclass VPseudoBinary { - let VLMul = MInfo.value, SEW=sew in { + int TargetConstraintType = 1, + bit Commutable = 0> { + let VLMul = MInfo.value, SEW=sew, isCommutable = Commutable in { defvar suffix = !if(sew, "_" # MInfo.MX # "_E" # sew, "_" # MInfo.MX); def suffix : VPseudoBinaryNoMaskTU; @@ -2167,8 +2168,9 @@ multiclass VPseudoBinaryM { - let VLMul = MInfo.value in { + int TargetConstraintType = 1, + bit Commutable = 0> { + let VLMul = MInfo.value, isCommutable = Commutable in { def "_" # MInfo.MX : VPseudoBinaryMOutNoMask; let ForceTailAgnostic = true in @@ -2226,8 +2228,8 @@ multiclass VPseudoTiedBinaryRoundingMode { - defm _VV : VPseudoBinary; +multiclass VPseudoBinaryV_VV { + defm _VV : VPseudoBinary; } multiclass VPseudoBinaryV_VV_RM { @@ -2331,9 +2333,10 @@ multiclass VPseudoVALU_MM { // * The destination EEW is greater than the source EEW, the source EMUL is // at least 1, and the overlap is in the highest-numbered part of the // destination register group is legal. Otherwise, it is illegal. -multiclass VPseudoBinaryW_VV { +multiclass VPseudoBinaryW_VV { defm _VV : VPseudoBinary; + "@earlyclobber $rd", TargetConstraintType=3, + Commutable=Commutable>; } multiclass VPseudoBinaryW_VV_RM { @@ -2453,7 +2456,9 @@ multiclass VPseudoBinaryV_VM; } -multiclass VPseudoTiedBinaryV_VM { +multiclass VPseudoTiedBinaryV_VM { + let isCommutable = Commutable in def "_VVM" # "_" # m.MX: VPseudoTiedBinaryCarryIn.R, m.vrclass, m.vrclass, m, 1, "", @@ -2667,9 +2672,11 @@ multiclass PseudoVEXT_VF8 { // lowest-numbered part of the source register group". // With LMUL<=1 the source and dest occupy a single register so any overlap // is in the lowest-numbered part. 
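Every pseudo listed above funnels into `fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3)`: operand 0 is the destination and operand 1 the merge (passthru) operand, so the two commutable vector sources sit at indices 2 and 3. For readers unfamiliar with the helper, below is a simplified standalone mirror of the `TargetInstrInfo::fixCommutedOpIndices` contract, where callers may pass a concrete index or an "any operand" sentinel:

```cpp
// Simplified mirror of TargetInstrInfo::fixCommutedOpIndices: narrow the
// caller-requested indices to the two commutable slots, or report failure.
constexpr unsigned CommuteAnyOperandIndex = ~0u;

bool fixCommutedOpIndices(unsigned &ResultIdx1, unsigned &ResultIdx2,
                          unsigned CommutableOpIdx1,
                          unsigned CommutableOpIdx2) {
  if (ResultIdx1 == CommuteAnyOperandIndex &&
      ResultIdx2 == CommuteAnyOperandIndex) {
    ResultIdx1 = CommutableOpIdx1;
    ResultIdx2 = CommutableOpIdx2;
  } else if (ResultIdx1 == CommuteAnyOperandIndex) {
    if (ResultIdx2 != CommutableOpIdx1 && ResultIdx2 != CommutableOpIdx2)
      return false;
    ResultIdx1 = ResultIdx2 == CommutableOpIdx1 ? CommutableOpIdx2
                                                : CommutableOpIdx1;
  } else if (ResultIdx2 == CommuteAnyOperandIndex) {
    if (ResultIdx1 != CommutableOpIdx1 && ResultIdx1 != CommutableOpIdx2)
      return false;
    ResultIdx2 = ResultIdx1 == CommutableOpIdx1 ? CommutableOpIdx2
                                                : CommutableOpIdx1;
  } else {
    // Both fixed: they must name exactly the two commutable slots.
    return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) ||
           (ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1);
  }
  return true;
}
```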
-multiclass VPseudoBinaryM_VV { +multiclass VPseudoBinaryM_VV { defm _VV : VPseudoBinaryM; + !if(!ge(m.octuple, 16), "@earlyclobber $rd", ""), + TargetConstraintType, Commutable=Commutable>; } multiclass VPseudoBinaryM_VX { @@ -2751,10 +2758,11 @@ multiclass VPseudoVSSHT_VV_VX_VI_RM { +multiclass VPseudoVALU_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIALUV", "ReadVIALUV", "ReadVIALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryV_VX, @@ -2804,17 +2812,17 @@ multiclass VPseudoVAALU_VV_VX_RM { multiclass VPseudoVMINMAX_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMinMaxV", "ReadVIMinMaxV", "ReadVIMinMaxV", mx>; defm "" : VPseudoBinaryV_VX, SchedBinary<"WriteVIMinMaxX", "ReadVIMinMaxV", "ReadVIMinMaxX", mx>; } } -multiclass VPseudoVMUL_VV_VX { +multiclass VPseudoVMUL_VV_VX { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryV_VV, + defm "" : VPseudoBinaryV_VV, SchedBinary<"WriteVIMulV", "ReadVIMulV", "ReadVIMulV", mx>; defm "" : VPseudoBinaryV_VX, SchedBinary<"WriteVIMulX", "ReadVIMulV", "ReadVIMulX", mx>; @@ -2964,10 +2972,10 @@ multiclass VPseudoVALU_VX_VI { } } -multiclass VPseudoVWALU_VV_VX { +multiclass VPseudoVWALU_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV, SchedBinary<"WriteVIWALUV", "ReadVIWALUV", "ReadVIWALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryW_VX, @@ -2976,10 +2984,10 @@ multiclass VPseudoVWALU_VV_VX { } } -multiclass VPseudoVWMUL_VV_VX { +multiclass VPseudoVWMUL_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoBinaryW_VV, + defm "" : VPseudoBinaryW_VV, SchedBinary<"WriteVIWMulV", "ReadVIWMulV", "ReadVIWMulV", mx, forceMergeOpRead=true>; defm "" : VPseudoBinaryW_VX, @@ -3074,7 +3082,7 @@ multiclass VPseudoVMRG_VM_XM_IM { multiclass VPseudoVCALU_VM_XM_IM { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoTiedBinaryV_VM, + defm "" : VPseudoTiedBinaryV_VM, SchedBinary<"WriteVICALUV", "ReadVICALUV", "ReadVICALUV", mx, forceMergeOpRead=true>; defm "" : VPseudoTiedBinaryV_XM, @@ -3287,10 +3295,10 @@ multiclass VPseudoTernaryV_VF_AAXA_RM; } -multiclass VPseudoTernaryW_VV { +multiclass VPseudoTernaryW_VV { defvar constraint = "@earlyclobber $rd"; defm _VV : VPseudoTernaryWithPolicy; + constraint, Commutable=Commutable, TargetConstraintType=3>; } multiclass VPseudoTernaryW_VV_RM { @@ -3380,10 +3388,10 @@ multiclass VPseudoVSLD_VX_VI { } } -multiclass VPseudoVWMAC_VV_VX { +multiclass VPseudoVWMAC_VV_VX { foreach m = MxListW in { defvar mx = m.MX; - defm "" : VPseudoTernaryW_VV, + defm "" : VPseudoTernaryW_VV, SchedTernary<"WriteVIWMulAddV", "ReadVIWMulAddV", "ReadVIWMulAddV", "ReadVIWMulAddV", mx>; defm "" : VPseudoTernaryW_VX, @@ -3436,10 +3444,10 @@ multiclass VPseudoVWMAC_VV_VF_BF_RM { } } -multiclass VPseudoVCMPM_VV_VX_VI { +multiclass VPseudoVCMPM_VV_VX_VI { foreach m = MxList in { defvar mx = m.MX; - defm "" : VPseudoBinaryM_VV, + defm "" : VPseudoBinaryM_VV, SchedBinary<"WriteVICmpV", "ReadVICmpV", "ReadVICmpV", mx>; defm "" : VPseudoBinaryM_VX, SchedBinary<"WriteVICmpX", "ReadVICmpV", "ReadVICmpX", mx>; @@ -6248,7 +6256,7 @@ defm PseudoVLSEG : VPseudoUSSegLoadFF; //===----------------------------------------------------------------------===// // 11.1. 
Vector Single-Width Integer Add and Subtract //===----------------------------------------------------------------------===// -defm PseudoVADD : VPseudoVALU_VV_VX_VI; +defm PseudoVADD : VPseudoVALU_VV_VX_VI; defm PseudoVSUB : VPseudoVALU_VV_VX; defm PseudoVRSUB : VPseudoVALU_VX_VI; @@ -6313,9 +6321,9 @@ foreach vti = AllIntegerVectors in { //===----------------------------------------------------------------------===// // 11.2. Vector Widening Integer Add/Subtract //===----------------------------------------------------------------------===// -defm PseudoVWADDU : VPseudoVWALU_VV_VX; +defm PseudoVWADDU : VPseudoVWALU_VV_VX; defm PseudoVWSUBU : VPseudoVWALU_VV_VX; -defm PseudoVWADD : VPseudoVWALU_VV_VX; +defm PseudoVWADD : VPseudoVWALU_VV_VX; defm PseudoVWSUB : VPseudoVWALU_VV_VX; defm PseudoVWADDU : VPseudoVWALU_WV_WX; defm PseudoVWSUBU : VPseudoVWALU_WV_WX; @@ -6346,9 +6354,9 @@ defm PseudoVMSBC : VPseudoVCALUM_V_X<"@earlyclobber $rd">; //===----------------------------------------------------------------------===// // 11.5. Vector Bitwise Logical Instructions //===----------------------------------------------------------------------===// -defm PseudoVAND : VPseudoVALU_VV_VX_VI; -defm PseudoVOR : VPseudoVALU_VV_VX_VI; -defm PseudoVXOR : VPseudoVALU_VV_VX_VI; +defm PseudoVAND : VPseudoVALU_VV_VX_VI; +defm PseudoVOR : VPseudoVALU_VV_VX_VI; +defm PseudoVXOR : VPseudoVALU_VV_VX_VI; //===----------------------------------------------------------------------===// // 11.6. Vector Single-Width Bit Shift Instructions @@ -6366,8 +6374,8 @@ defm PseudoVNSRA : VPseudoVNSHT_WV_WX_WI; //===----------------------------------------------------------------------===// // 11.8. Vector Integer Comparison Instructions //===----------------------------------------------------------------------===// -defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; -defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSEQ : VPseudoVCMPM_VV_VX_VI; +defm PseudoVMSNE : VPseudoVCMPM_VV_VX_VI; defm PseudoVMSLTU : VPseudoVCMPM_VV_VX; defm PseudoVMSLT : VPseudoVCMPM_VV_VX; defm PseudoVMSLEU : VPseudoVCMPM_VV_VX_VI; @@ -6386,9 +6394,9 @@ defm PseudoVMAX : VPseudoVMINMAX_VV_VX; //===----------------------------------------------------------------------===// // 11.10. Vector Single-Width Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVMUL : VPseudoVMUL_VV_VX; -defm PseudoVMULH : VPseudoVMUL_VV_VX; -defm PseudoVMULHU : VPseudoVMUL_VV_VX; +defm PseudoVMUL : VPseudoVMUL_VV_VX; +defm PseudoVMULH : VPseudoVMUL_VV_VX; +defm PseudoVMULHU : VPseudoVMUL_VV_VX; defm PseudoVMULHSU : VPseudoVMUL_VV_VX; //===----------------------------------------------------------------------===// @@ -6402,8 +6410,8 @@ defm PseudoVREM : VPseudoVDIV_VV_VX; //===----------------------------------------------------------------------===// // 11.12. Vector Widening Integer Multiply Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMUL : VPseudoVWMUL_VV_VX; -defm PseudoVWMULU : VPseudoVWMUL_VV_VX; +defm PseudoVWMUL : VPseudoVWMUL_VV_VX; +defm PseudoVWMULU : VPseudoVWMUL_VV_VX; defm PseudoVWMULSU : VPseudoVWMUL_VV_VX; //===----------------------------------------------------------------------===// @@ -6417,8 +6425,8 @@ defm PseudoVNMSUB : VPseudoVMAC_VV_VX_AAXA; //===----------------------------------------------------------------------===// // 11.14. 
Vector Widening Integer Multiply-Add Instructions //===----------------------------------------------------------------------===// -defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; -defm PseudoVWMACC : VPseudoVWMAC_VV_VX; +defm PseudoVWMACCU : VPseudoVWMAC_VV_VX; +defm PseudoVWMACC : VPseudoVWMAC_VV_VX; defm PseudoVWMACCSU : VPseudoVWMAC_VV_VX; defm PseudoVWMACCUS : VPseudoVWMAC_VX; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td index 9a6818c99af20..71aa1f19e089a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td @@ -307,10 +307,16 @@ multiclass VPseudoVC_X { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_X; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_X; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_X, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_X, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_X; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_X, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -318,10 +324,16 @@ multiclass VPseudoVC_XV { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XV; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XV; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -329,10 +341,16 @@ multiclass VPseudoVC_XVV { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in { - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV; - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XVV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } @@ -340,11 +358,17 @@ multiclass VPseudoVC_XVW { let VLMul = m.value in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in - def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV; + def "PseudoVC_" # NAME # "_SE_" # m.MX + : VPseudoVC_XVV, + Sched<[!cast("WriteVC_" # NAME # "_" # m.MX)]>; let Constraints = "@earlyclobber $rd, $rd = $rs3" in { let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in - def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV; - def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV; + def "PseudoVC_V_" # NAME # "_SE_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; + def "PseudoVC_V_" # NAME # "_" # m.MX + : VPseudoVC_V_XVV, + Sched<[!cast("WriteVC_V_" # NAME # "_" # m.MX)]>; } } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td index dd13a07d606d0..32e7f962aa2ab 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZcmop.td @@ -20,13 
+20,7 @@ class CMOPInst<bits<3> imm3, string opcodestr>
   let Inst{12-11} = 0;
 }
 
-// CMOP1, CMOP5 is used by Zicfiss.
-let Predicates = [HasStdExtZcmop, NoHasStdExtZicfiss] in {
-  def CMOP1 : CMOPInst<0, "cmop.1">, Sched<[]>;
-  def CMOP5 : CMOPInst<2, "cmop.5">, Sched<[]>;
-}
-
-foreach n = [3, 7, 9, 11, 13, 15] in {
+foreach n = [1, 3, 5, 7, 9, 11, 13, 15] in {
   let Predicates = [HasStdExtZcmop] in
-    def CMOP # n : CMOPInst<!srl(n, 1), "cmop." # n>, Sched<[]>;
+    def C_MOP # n : CMOPInst<!srl(n, 1), "c.mop." # n>, Sched<[]>;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
index 39d420c2fbf08..ead91c5656be8 100644
--- a/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
+++ b/llvm/lib/Target/RISCV/RISCVOptWInstrs.cpp
@@ -12,15 +12,24 @@
 // extended bits aren't consumed or because the input was already sign extended
 // by an earlier instruction.
 //
-// Then it removes the -w suffix from opw instructions whenever all users are
-// dependent only on the lower word of the result of the instruction.
-// The cases handled are:
-// * addw because c.add has a larger register encoding than c.addw.
-// * addiw because it helps reduce test differences between RV32 and RV64
-//   w/o being a pessimization.
-// * mulw because c.mulw doesn't exist but c.mul does (w/ zcb)
-// * slliw because c.slliw doesn't exist and c.slli does
+// Then:
+// 1. Unless explicitly disabled, and when the target does not prefer
+//    instructions with a W suffix, it removes the -w suffix from opw
+//    instructions whenever all users depend only on the lower word of the
+//    instruction's result. The cases handled are:
+//    * addw because c.add has a larger register encoding than c.addw.
+//    * addiw because it helps reduce test differences between RV32 and RV64
+//      w/o being a pessimization.
+//    * mulw because c.mulw doesn't exist but c.mul does (w/ zcb)
+//    * slliw because c.slliw doesn't exist and c.slli does
+//
+// 2. Otherwise, when the target prefers instructions with a W suffix, it adds
+//    the W suffix to an instruction whenever all users depend only on the
+//    lower word of the instruction's result. The cases handled are:
+//    * add/addi/sub/mul.
+//    * slli with imm < 32.
+//    * ld/lwu.
+//
 //===---------------------------------------------------------------------===//
 
 #include "RISCV.h"
@@ -60,6 +69,8 @@ class RISCVOptWInstrs : public MachineFunctionPass {
                          const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
   bool stripWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
                       const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
+  bool appendWSuffixes(MachineFunction &MF, const RISCVInstrInfo &TII,
+                       const RISCVSubtarget &ST, MachineRegisterInfo &MRI);
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
@@ -672,9 +683,6 @@ bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF,
                                      const RISCVInstrInfo &TII,
                                      const RISCVSubtarget &ST,
                                      MachineRegisterInfo &MRI) {
-  if (DisableStripWSuffix || !ST.enableStripWSuffix())
-    return false;
-
   bool MadeChange = false;
   for (MachineBasicBlock &MBB : MF) {
     for (MachineInstr &MI : MBB) {
@@ -698,6 +706,58 @@ bool RISCVOptWInstrs::stripWSuffixes(MachineFunction &MF,
   return MadeChange;
 }
 
+bool RISCVOptWInstrs::appendWSuffixes(MachineFunction &MF,
+                                      const RISCVInstrInfo &TII,
+                                      const RISCVSubtarget &ST,
+                                      MachineRegisterInfo &MRI) {
+  bool MadeChange = false;
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned WOpc;
+      // TODO: Add more?
+ switch (MI.getOpcode()) { + default: + continue; + case RISCV::ADD: + WOpc = RISCV::ADDW; + break; + case RISCV::ADDI: + WOpc = RISCV::ADDIW; + break; + case RISCV::SUB: + WOpc = RISCV::SUBW; + break; + case RISCV::MUL: + WOpc = RISCV::MULW; + break; + case RISCV::SLLI: + // SLLIW reads the lowest 5 bits, while SLLI reads lowest 6 bits + if (MI.getOperand(2).getImm() >= 32) + continue; + WOpc = RISCV::SLLIW; + break; + case RISCV::LD: + case RISCV::LWU: + WOpc = RISCV::LW; + break; + } + + if (hasAllWUsers(MI, ST, MRI)) { + LLVM_DEBUG(dbgs() << "Replacing " << MI); + MI.setDesc(TII.get(WOpc)); + MI.clearFlag(MachineInstr::MIFlag::NoSWrap); + MI.clearFlag(MachineInstr::MIFlag::NoUWrap); + MI.clearFlag(MachineInstr::MIFlag::IsExact); + LLVM_DEBUG(dbgs() << " with " << MI); + ++NumTransformedToWInstrs; + MadeChange = true; + } + } + } + + return MadeChange; +} + bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(MF.getFunction())) return false; @@ -711,7 +771,12 @@ bool RISCVOptWInstrs::runOnMachineFunction(MachineFunction &MF) { bool MadeChange = false; MadeChange |= removeSExtWInstrs(MF, TII, ST, MRI); - MadeChange |= stripWSuffixes(MF, TII, ST, MRI); + + if (!(DisableStripWSuffix || ST.preferWInst())) + MadeChange |= stripWSuffixes(MF, TII, ST, MRI); + + if (ST.preferWInst()) + MadeChange |= appendWSuffixes(MF, TII, ST, MRI); return MadeChange; } diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index fd6d6078ec238..f9a557e02bfe1 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -56,11 +56,13 @@ class RISCVTuneProcessorModel, + [Feature32Bit, + FeatureStdExtI]>, GenericTuneInfo; def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", NoSchedModel, - [Feature64Bit]>, + [Feature64Bit, + FeatureStdExtI]>, GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. 
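It is worth spelling out why `appendWSuffixes` clears the `NoSWrap`/`NoUWrap`/`IsExact` flags: the W form computes a different full-width value (the low 32 bits of the result, sign-extended), so wrap information about the original 64-bit operation no longer applies. A small self-contained illustration of the `ADD` vs. `ADDW` semantics, in plain C++ standing in for the two instructions:

```cpp
#include <cstdint>
#include <cstdio>

// ADD: full 64-bit sum.
uint64_t add(uint64_t A, uint64_t B) { return A + B; }

// ADDW: 32-bit sum, sign-extended back to 64 bits.
uint64_t addw(uint64_t A, uint64_t B) {
  int32_t Low = static_cast<int32_t>(static_cast<uint32_t>(A + B));
  return static_cast<uint64_t>(static_cast<int64_t>(Low));
}

int main() {
  uint64_t A = 0x7fffffff, B = 1;
  // The results differ in bits 63..32 but agree in bits 31..0, which is
  // exactly the part hasAllWUsers() proves every consumer reads.
  std::printf("add:  %016llx\n", static_cast<unsigned long long>(add(A, B)));
  std::printf("addw: %016llx\n", static_cast<unsigned long long>(addw(A, B)));
}
```

Since the two opcodes agree on bits 31..0 for all inputs, the rewrite is sound precisely under the all-W-users condition that gates it.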
@@ -69,11 +71,13 @@ def : ProcessorModel<"generic", NoSchedModel, []>, GenericTuneInfo; def ROCKET_RV32 : RISCVProcessorModel<"rocket-rv32", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr]>; def ROCKET_RV64 : RISCVProcessorModel<"rocket-rv64", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr]>; def ROCKET : RISCVTuneProcessorModel<"rocket", @@ -86,6 +90,7 @@ def SIFIVE_7 : RISCVTuneProcessorModel<"sifive-7-series", def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -94,6 +99,7 @@ def SIFIVE_E20 : RISCVProcessorModel<"sifive-e20", def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -103,6 +109,7 @@ def SIFIVE_E21 : RISCVProcessorModel<"sifive-e21", def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -112,6 +119,7 @@ def SIFIVE_E24 : RISCVProcessorModel<"sifive-e24", def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtM, @@ -121,6 +129,7 @@ def SIFIVE_E31 : RISCVProcessorModel<"sifive-e31", def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34", RocketModel, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -130,6 +139,7 @@ def SIFIVE_E34 : RISCVProcessorModel<"sifive-e34", def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76", SiFive7Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -140,6 +150,7 @@ def SIFIVE_E76 : RISCVProcessorModel<"sifive-e76", def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -149,6 +160,7 @@ def SIFIVE_S21 : RISCVProcessorModel<"sifive-s21", def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -158,6 +170,7 @@ def SIFIVE_S51 : RISCVProcessorModel<"sifive-s51", def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -168,6 +181,7 @@ def SIFIVE_S54 : RISCVProcessorModel<"sifive-s54", def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -180,6 +194,7 @@ def SIFIVE_S76 : RISCVProcessorModel<"sifive-s76", def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54", RocketModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -190,6 +205,7 @@ def SIFIVE_U54 : RISCVProcessorModel<"sifive-u54", def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -200,6 +216,7 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -217,6 +234,7 @@ def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, [Feature64Bit, + FeatureStdExtI, 
FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -239,7 +257,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, FeatureStdExtZbb, FeatureStdExtZbs, FeatureStdExtZfhmin, - FeatureFastUnalignedAccess], + FeatureUnalignedScalarMem, + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, @@ -247,6 +266,7 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtM, FeatureStdExtA, @@ -276,7 +296,8 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, FeatureStdExtZvkng, FeatureStdExtZvksc, FeatureStdExtZvksg, - FeatureFastUnalignedAccess], + FeatureUnalignedScalarMem, + FeatureUnalignedVectorMem], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, @@ -286,6 +307,7 @@ def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", SiFiveP600Model, def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", SyntacoreSCR1Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtC], @@ -294,6 +316,7 @@ def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base", def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", SyntacoreSCR1Model, [Feature32Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, @@ -303,6 +326,7 @@ def SYNTACORE_SCR1_MAX : RISCVProcessorModel<"syntacore-scr1-max", def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", NoSchedModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZifencei, FeatureStdExtZicsr, FeatureStdExtZicntr, @@ -332,6 +356,7 @@ def VENTANA_VEYRON_V1 : RISCVProcessorModel<"veyron-v1", def XIANGSHAN_NANHU : RISCVProcessorModel<"xiangshan-nanhu", XiangShanNanHuModel, [Feature64Bit, + FeatureStdExtI, FeatureStdExtZicsr, FeatureStdExtZifencei, FeatureStdExtM, diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index e74c7aab7474d..65494e73758d6 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -261,4 +261,5 @@ defm : UnsupportedSchedZbkx; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; defm : UnsupportedSchedSFB; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 4dcec96df9d57..a532066b3a1c8 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -962,6 +962,54 @@ let Latency = 3 in def : InstRW<[WriteIALU], (instrs COPY)>; +// VCIX +// +// In principle we don't know the latency of any VCIX instructions. But instead +// of taking the default of 1, which can lead to issues [1], we assume that they +// have a fairly high latency. 
+// +// [1] https://github.com/llvm/llvm-project/issues/83391 +foreach mx = SchedMxList in { + defvar Cycles = SiFive7GetCyclesDefault.c; + defvar IsWorstCase = SiFive7IsWorstCaseMX.c; + let Latency = !mul(Cycles, 10), + AcquireAtCycles = [0, 1], + ReleaseAtCycles = [1, !add(1, Cycles)] in { + defm "" : LMULWriteResMX<"WriteVC_V_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_V_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + } + defm "" : LMULWriteResMX<"WriteVC_I", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_X", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_IVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_VVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XVV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_XVW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + foreach f = ["FPR16", "FPR32", "FPR64"] in { + defm "" : LMULWriteResMX<"WriteVC_" # f # "V", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_" # f # "VV", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + defm "" : LMULWriteResMX<"WriteVC_" # f # "VW", [SiFive7VCQ, SiFive7VA], mx, IsWorstCase>; + } + } +} + //===----------------------------------------------------------------------===// // Bypass and advance diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td index 8ec2e4ff885eb..fccdd7e4f3ec2 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td @@ -366,4 +366,5 @@ defm : UnsupportedSchedZbkx; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZfa; defm : UnsupportedSchedV; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td index 0ced2efa3f7ab..6e4fb19361f55 100644 --- 
a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td @@ -1040,4 +1040,5 @@ defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZfa; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td index 9625d17e0b260..0885e325f24e6 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td @@ -212,4 +212,5 @@ defm : UnsupportedSchedZbkb; defm : UnsupportedSchedZbkx; defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td index 4fc7b0335af53..e0f1fab1d6b40 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td +++ b/llvm/lib/Target/RISCV/RISCVSchedXiangShanNanHu.td @@ -311,4 +311,5 @@ defm : UnsupportedSchedZfa; defm : UnsupportedSchedZfh; defm : UnsupportedSchedSFB; defm : UnsupportedSchedZabha; +defm : UnsupportedSchedXsfvcp; } diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 1d19624342d2b..0086557a41fe7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -296,3 +296,4 @@ def : ReadAdvance; // Include the scheduler resources for other instruction extensions. include "RISCVScheduleZb.td" include "RISCVScheduleV.td" +include "RISCVScheduleXSf.td" diff --git a/llvm/lib/Target/RISCV/RISCVScheduleV.td b/llvm/lib/Target/RISCV/RISCVScheduleV.td index 5993884bc2c1c..5be06d4c3f7e7 100644 --- a/llvm/lib/Target/RISCV/RISCVScheduleV.td +++ b/llvm/lib/Target/RISCV/RISCVScheduleV.td @@ -1079,7 +1079,7 @@ defm "" : LMULReadAdvance<"ReadVFCvtFToIV", 0>; defm "" : LMULSEWReadAdvanceW<"ReadVFWCvtIToFV", 0>; defm "" : LMULReadAdvanceFW<"ReadVFWCvtFToIV", 0>; defm "" : LMULSEWReadAdvanceFW<"ReadVFWCvtFToFV", 0>; -defm "" : LMULSEWReadAdvanceFW<"SEWReadVFNCvtIToFV", 0>; +defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtIToFV", 0>; defm "" : LMULReadAdvanceW<"ReadVFNCvtFToIV", 0>; defm "" : LMULSEWReadAdvanceFW<"ReadVFNCvtFToFV", 0>; diff --git a/llvm/lib/Target/RISCV/RISCVScheduleXSf.td b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td new file mode 100644 index 0000000000000..58d508460f019 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVScheduleXSf.td @@ -0,0 +1,59 @@ +//===-- RISCVScheduleXSf.td - Scheduling Definitions XSf ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the scheduling information for SiFive extensions. 
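To make the earlier SiFive7 VCIX numbers concrete: `Latency = !mul(Cycles, 10)` scales a default cycle count that grows with the register-group size (LMUL). The sketch below is a rough restatement; the per-LMUL cycle counts are my reading of `SiFive7GetCyclesDefault`, not authoritative values:

```cpp
#include <cstdio>
#include <string>

// Rough reading of SiFive7GetCyclesDefault: the default cycle count is the
// number of whole vector registers in the group (fractional LMULs act as 1).
int cyclesDefault(const std::string &MX) {
  if (MX == "M2") return 2;
  if (MX == "M4") return 4;
  if (MX == "M8") return 8;
  return 1; // MF8, MF4, MF2, M1
}

int main() {
  for (const char *MX : {"MF8", "MF4", "MF2", "M1", "M2", "M4", "M8"})
    std::printf("%-3s -> assumed VCIX latency %2d cycles\n", MX,
                10 * cyclesDefault(MX)); // mirrors Latency = !mul(Cycles, 10)
}
```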
+//
+//===----------------------------------------------------------------------===//
+
+multiclass LMULSchedWritesVCIX<string id> {
+defm "" : LMULSchedWrites<"WriteVC_" # id>;
+defm "" : LMULSchedWrites<"WriteVC_V_" # id>;
+}
+
+defm "" : LMULSchedWritesVCIX<"I">;
+defm "" : LMULSchedWritesVCIX<"X">;
+defm "" : LMULSchedWritesVCIX<"IV">;
+defm "" : LMULSchedWritesVCIX<"VV">;
+defm "" : LMULSchedWritesVCIX<"XV">;
+defm "" : LMULSchedWritesVCIX<"IVV">;
+defm "" : LMULSchedWritesVCIX<"IVW">;
+defm "" : LMULSchedWritesVCIX<"VVV">;
+defm "" : LMULSchedWritesVCIX<"VVW">;
+defm "" : LMULSchedWritesVCIX<"XVV">;
+defm "" : LMULSchedWritesVCIX<"XVW">;
+foreach f = ["FPR16", "FPR32", "FPR64"] in {
+  defm "" : LMULSchedWritesVCIX<f # "V">;
+  defm "" : LMULSchedWritesVCIX<f # "VV">;
+  defm "" : LMULSchedWritesVCIX<f # "VW">;
+}
+
+multiclass LMULWriteResVCIX<string id, list<ProcResourceKind> resources> {
+defm : LMULWriteRes<"WriteVC_" # id, resources>;
+defm : LMULWriteRes<"WriteVC_V_" # id, resources>;
+}
+
+multiclass UnsupportedSchedXsfvcp {
+let Unsupported = true in {
+defm : LMULWriteResVCIX<"I", []>;
+defm : LMULWriteResVCIX<"X", []>;
+defm : LMULWriteResVCIX<"IV", []>;
+defm : LMULWriteResVCIX<"VV", []>;
+defm : LMULWriteResVCIX<"XV", []>;
+defm : LMULWriteResVCIX<"IVV", []>;
+defm : LMULWriteResVCIX<"IVW", []>;
+defm : LMULWriteResVCIX<"VVV", []>;
+defm : LMULWriteResVCIX<"VVW", []>;
+defm : LMULWriteResVCIX<"XVV", []>;
+defm : LMULWriteResVCIX<"XVW", []>;
+foreach f = ["FPR16", "FPR32", "FPR64"] in {
+  defm : LMULWriteResVCIX<f # "V", []>;
+  defm : LMULWriteResVCIX<f # "VV", []>;
+  defm : LMULWriteResVCIX<f # "VW", []>;
+}
+}
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index bc9756c5e6dda..56f5bd8794ae7 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1335,8 +1335,8 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                      I);
 
+  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
   if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
     if (CondTy->isVectorTy()) {
       if (ValTy->getScalarSizeInBits() == 1) {
         // vmandn.mm v8, v8, v9
@@ -1375,14 +1375,15 @@ InstructionCost RISCVTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                              LT.second, CostKind);
   }
 
-  if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
-      ValTy->isVectorTy()) {
-    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
-
-    // Support natively.
- if (CmpInst::isIntPredicate(VecPred)) - return LT.first * 1; + if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() && + CmpInst::isIntPredicate(VecPred)) { + // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE + // provided they incur the same cost across all implementations + return LT.first * + getRISCVInstructionCost(RISCV::VMSLT_VV, LT.second, CostKind); + } + if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy()) { // If we do not support the input floating point vector type, use the base // one which will calculate as: // ScalarizeCost + Num * Cost for fixed vector, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index e0c0e6517b6f1..2f9281ab89244 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -228,7 +228,7 @@ class RISCVTTIImpl : public BasicTTIImplBase { return false; EVT ElemType = DataTypeVT.getScalarType(); - if (!ST->hasFastUnalignedAccess() && Alignment < ElemType.getStoreSize()) + if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; return TLI->isLegalElementTypeForRVV(ElemType); @@ -253,7 +253,7 @@ class RISCVTTIImpl : public BasicTTIImplBase { return false; EVT ElemType = DataTypeVT.getScalarType(); - if (!ST->hasFastUnalignedAccess() && Alignment < ElemType.getStoreSize()) + if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize()) return false; return TLI->isLegalElementTypeForRVV(ElemType); diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 05e41e06248e3..cebe230d3e8ce 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -653,7 +653,8 @@ Register SPIRVGlobalRegistry::buildGlobalVariable( auto MRI = MIRBuilder.getMRI(); assert(MRI->getType(ResVReg).isPointer() && "Pointer type is expected"); if (Reg != ResVReg) { - LLT RegLLTy = LLT::pointer(MRI->getType(ResVReg).getAddressSpace(), 32); + LLT RegLLTy = + LLT::pointer(MRI->getType(ResVReg).getAddressSpace(), getPointerSize()); MRI->setType(Reg, RegLLTy); assignSPIRVTypeToVReg(BaseType, Reg, MIRBuilder.getMF()); } else { diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp index aacfecc1e313f..af98f2f880459 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.cpp @@ -247,8 +247,10 @@ void SPIRVInstrInfo::copyPhysReg(MachineBasicBlock &MBB, bool SPIRVInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { if (MI.getOpcode() == SPIRV::GET_ID || MI.getOpcode() == SPIRV::GET_fID || - MI.getOpcode() == SPIRV::GET_pID || MI.getOpcode() == SPIRV::GET_vfID || - MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID) { + MI.getOpcode() == SPIRV::GET_pID32 || + MI.getOpcode() == SPIRV::GET_pID64 || MI.getOpcode() == SPIRV::GET_vfID || + MI.getOpcode() == SPIRV::GET_vID || MI.getOpcode() == SPIRV::GET_vpID32 || + MI.getOpcode() == SPIRV::GET_vpID64) { auto &MRI = MI.getMF()->getRegInfo(); MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg()); MI.eraseFromParent(); diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index a3f981457c8da..151d0ec1fe569 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -19,10 +19,12 @@ let isCodeGenOnly=1 in { def DECL_TYPE: Pseudo<(outs ANYID:$dst_id), 
(ins ANYID:$src_id, TYPE:$src_ty)>; def GET_ID: Pseudo<(outs ID:$dst_id), (ins ANYID:$src)>; def GET_fID: Pseudo<(outs fID:$dst_id), (ins ANYID:$src)>; - def GET_pID: Pseudo<(outs pID:$dst_id), (ins ANYID:$src)>; + def GET_pID32: Pseudo<(outs pID32:$dst_id), (ins ANYID:$src)>; + def GET_pID64: Pseudo<(outs pID64:$dst_id), (ins ANYID:$src)>; def GET_vID: Pseudo<(outs vID:$dst_id), (ins ANYID:$src)>; def GET_vfID: Pseudo<(outs vfID:$dst_id), (ins ANYID:$src)>; - def GET_vpID: Pseudo<(outs vpID:$dst_id), (ins ANYID:$src)>; + def GET_vpID32: Pseudo<(outs vpID32:$dst_id), (ins ANYID:$src)>; + def GET_vpID64: Pseudo<(outs vpID64:$dst_id), (ins ANYID:$src)>; } def SPVTypeBin : SDTypeProfile<1, 2, []>; @@ -66,8 +68,10 @@ multiclass TernOpTypedGen opCode, SDNode node, bit genP = def SIVCond: TernOpTyped; } if genP then { - def SPSCond: TernOpTyped; - def SPVCond: TernOpTyped; + def SPSCond32: TernOpTyped; + def SPVCond32: TernOpTyped; + def SPSCond64: TernOpTyped; + def SPVCond64: TernOpTyped; } if genV then { if genF then { @@ -79,8 +83,10 @@ multiclass TernOpTypedGen opCode, SDNode node, bit genP = def VIVCond: TernOpTyped; } if genP then { - def VPSCond: TernOpTyped; - def VPVCond: TernOpTyped; + def VPSCond32: TernOpTyped; + def VPVCond32: TernOpTyped; + def VPSCond64: TernOpTyped; + def VPVCond64: TernOpTyped; } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 200fe38298c02..72e5a7bcac983 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -290,14 +290,18 @@ bool SPIRVInstructionSelector::select(MachineInstr &I) { // If it's not a GMIR instruction, we've selected it already. if (!isPreISelGenericOpcode(Opcode)) { if (Opcode == SPIRV::ASSIGN_TYPE) { // These pseudos aren't needed any more. 
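With two pointer widths in play, choosing a `GET_*` pseudo (and its register class) becomes a three-way dispatch: scalar vs. vector, float vs. integer, and 32- vs. 64-bit pointers. The following is a compact restatement of the selection that `createNewIdReg()` performs later in this patch, with a toy enum standing in for the real opcodes and register classes:

```cpp
// Toy restatement of the GET_* opcode choice; the real code also picks the
// matching SPIRV::*RegClass and an LLT alongside the opcode.
enum class GetIdOp { ID, fID, pID32, pID64, vID, vfID, vpID32, vpID64 };

GetIdOp pickGetIdOp(bool IsPointer, bool IsVector, bool IsFloat,
                    unsigned PointerSize) {
  if (IsPointer) {
    if (PointerSize == 64)
      return IsVector ? GetIdOp::vpID64 : GetIdOp::pID64;
    return IsVector ? GetIdOp::vpID32 : GetIdOp::pID32;
  }
  if (IsVector)
    return IsFloat ? GetIdOp::vfID : GetIdOp::vID;
  return IsFloat ? GetIdOp::fID : GetIdOp::ID;
}
```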
- auto *Def = MRI->getVRegDef(I.getOperand(1).getReg()); + Register DstReg = I.getOperand(0).getReg(); + Register SrcReg = I.getOperand(1).getReg(); + auto *Def = MRI->getVRegDef(SrcReg); if (isTypeFoldingSupported(Def->getOpcode())) { + if (MRI->getType(DstReg).isPointer()) + MRI->setType(DstReg, LLT::scalar(32)); bool Res = selectImpl(I, *CoverageInfo); assert(Res || Def->getOpcode() == TargetOpcode::G_CONSTANT); if (Res) return Res; } - MRI->replaceRegWith(I.getOperand(1).getReg(), I.getOperand(0).getReg()); + MRI->replaceRegWith(SrcReg, DstReg); I.removeFromParent(); return true; } else if (I.getNumDefs() == 1) { diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index f069a92ac6868..d652b5de60808 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -55,8 +55,9 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, static bool isMetaInstrGET(unsigned Opcode) { return Opcode == SPIRV::GET_ID || Opcode == SPIRV::GET_fID || - Opcode == SPIRV::GET_pID || Opcode == SPIRV::GET_vID || - Opcode == SPIRV::GET_vfID || Opcode == SPIRV::GET_vpID; + Opcode == SPIRV::GET_pID32 || Opcode == SPIRV::GET_pID64 || + Opcode == SPIRV::GET_vID || Opcode == SPIRV::GET_vfID || + Opcode == SPIRV::GET_vpID32 || Opcode == SPIRV::GET_vpID64; } static bool mayBeInserted(unsigned Opcode) { diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 2c964595fc39e..d16f6d5bf67ef 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -171,6 +171,12 @@ static void insertBitcasts(MachineFunction &MF, SPIRVGlobalRegistry *GR, // %1 = G_GLOBAL_VALUE // %2 = COPY %1 // %3 = G_ADDRSPACE_CAST %2 +// +// or +// +// %1 = G_ZEXT %2 +// G_MEMCPY ... %2 ... +// // New registers have no SPIRVType and no register class info. 
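The width rule in the new `G_ZEXT` case below is simple enough to state on its own: keep the source's element count, and widen the scalar width to the larger of the destination's scalar width and the width already known for the source. A toy restatement, with a stand-in struct instead of `SPIRVType`:

```cpp
#include <algorithm>

// Toy stand-in for the SPIRVType a zext propagates.
struct ToySpirvType {
  unsigned BitWidth = 32;
  unsigned NumElements = 1; // 1 means scalar
};

ToySpirvType propagateThroughZExt(const ToySpirvType &Src,
                                  unsigned DstScalarBits) {
  ToySpirvType Result;
  // A zext never narrows, so take the wider of the two widths.
  Result.BitWidth = std::max(DstScalarBits, Src.BitWidth);
  Result.NumElements = Src.NumElements; // vector shape is preserved
  return Result;
}
```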
// // Set SPIRVType for GV, propagate it from GV to other instructions, @@ -200,6 +206,24 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, SpirvTy = GR->getOrCreateSPIRVType(Ty, MIB); break; } + case TargetOpcode::G_ZEXT: { + if (MI->getOperand(1).isReg()) { + if (MachineInstr *DefInstr = + MRI.getVRegDef(MI->getOperand(1).getReg())) { + if (SPIRVType *Def = propagateSPIRVType(DefInstr, GR, MRI, MIB)) { + unsigned CurrentBW = GR->getScalarOrVectorBitWidth(Def); + unsigned ExpectedBW = + std::max(MRI.getType(Reg).getScalarSizeInBits(), CurrentBW); + unsigned NumElements = GR->getScalarOrVectorComponentCount(Def); + SpirvTy = GR->getOrCreateSPIRVIntegerType(ExpectedBW, MIB); + if (NumElements > 1) + SpirvTy = + GR->getOrCreateSPIRVVectorType(SpirvTy, NumElements, MIB); + } + } + } + break; + } case TargetOpcode::G_TRUNC: case TargetOpcode::G_ADDRSPACE_CAST: case TargetOpcode::G_PTR_ADD: @@ -223,11 +247,12 @@ static SPIRVType *propagateSPIRVType(MachineInstr *MI, SPIRVGlobalRegistry *GR, } static std::pair -createNewIdReg(Register ValReg, unsigned Opcode, MachineRegisterInfo &MRI, +createNewIdReg(SPIRVType *SpvType, Register SrcReg, MachineRegisterInfo &MRI, const SPIRVGlobalRegistry &GR) { - LLT NewT = LLT::scalar(32); - SPIRVType *SpvType = GR.getSPIRVTypeForVReg(ValReg); + if (!SpvType) + SpvType = GR.getSPIRVTypeForVReg(SrcReg); assert(SpvType && "VReg is expected to have SPIRV type"); + LLT NewT = LLT::scalar(32); bool IsFloat = SpvType->getOpcode() == SPIRV::OpTypeFloat; bool IsVectorFloat = SpvType->getOpcode() == SPIRV::OpTypeVector && @@ -236,14 +261,38 @@ createNewIdReg(Register ValReg, unsigned Opcode, MachineRegisterInfo &MRI, IsFloat |= IsVectorFloat; auto GetIdOp = IsFloat ? SPIRV::GET_fID : SPIRV::GET_ID; auto DstClass = IsFloat ? &SPIRV::fIDRegClass : &SPIRV::IDRegClass; - if (MRI.getType(ValReg).isPointer()) { - NewT = LLT::pointer(0, 32); - GetIdOp = SPIRV::GET_pID; - DstClass = &SPIRV::pIDRegClass; - } else if (MRI.getType(ValReg).isVector()) { + if (MRI.getType(SrcReg).isPointer()) { + unsigned PtrSz = GR.getPointerSize(); + NewT = LLT::pointer(0, PtrSz); + bool IsVec = MRI.getType(SrcReg).isVector(); + if (IsVec) + NewT = LLT::fixed_vector(2, NewT); + if (PtrSz == 64) { + if (IsVec) { + GetIdOp = SPIRV::GET_vpID64; + DstClass = &SPIRV::vpID64RegClass; + } else { + GetIdOp = SPIRV::GET_pID64; + DstClass = &SPIRV::pID64RegClass; + } + } else { + if (IsVec) { + GetIdOp = SPIRV::GET_vpID32; + DstClass = &SPIRV::vpID32RegClass; + } else { + GetIdOp = SPIRV::GET_pID32; + DstClass = &SPIRV::pID32RegClass; + } + } + } else if (MRI.getType(SrcReg).isVector()) { NewT = LLT::fixed_vector(2, NewT); - GetIdOp = IsFloat ? SPIRV::GET_vfID : SPIRV::GET_vID; - DstClass = IsFloat ? &SPIRV::vfIDRegClass : &SPIRV::vIDRegClass; + if (IsFloat) { + GetIdOp = SPIRV::GET_vfID; + DstClass = &SPIRV::vfIDRegClass; + } else { + GetIdOp = SPIRV::GET_vID; + DstClass = &SPIRV::vIDRegClass; + } } Register IdReg = MRI.createGenericVirtualRegister(NewT); MRI.setRegClass(IdReg, DstClass); @@ -264,6 +313,7 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, MIB.setInsertPt(*Def->getParent(), (Def->getNextNode() ? Def->getNextNode()->getIterator() : Def->getParent()->end())); + SpirvTy = SpirvTy ? 
SpirvTy : GR->getOrCreateSPIRVType(Ty, MIB); Register NewReg = MRI.createGenericVirtualRegister(MRI.getType(Reg)); if (auto *RC = MRI.getRegClassOrNull(Reg)) { MRI.setRegClass(NewReg, RC); @@ -271,7 +321,6 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, MRI.setRegClass(NewReg, &SPIRV::IDRegClass); MRI.setRegClass(Reg, &SPIRV::IDRegClass); } - SpirvTy = SpirvTy ? SpirvTy : GR->getOrCreateSPIRVType(Ty, MIB); GR->assignSPIRVTypeToVReg(SpirvTy, Reg, MIB.getMF()); // This is to make it convenient for Legalizer to get the SPIRVType // when processing the actual MI (i.e. not pseudo one). @@ -290,11 +339,11 @@ Register insertAssignInstr(Register Reg, Type *Ty, SPIRVType *SpirvTy, void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, MachineRegisterInfo &MRI, SPIRVGlobalRegistry *GR) { - unsigned Opc = MI.getOpcode(); assert(MI.getNumDefs() > 0 && MRI.hasOneUse(MI.getOperand(0).getReg())); MachineInstr &AssignTypeInst = *(MRI.use_instr_begin(MI.getOperand(0).getReg())); - auto NewReg = createNewIdReg(MI.getOperand(0).getReg(), Opc, MRI, *GR).first; + auto NewReg = + createNewIdReg(nullptr, MI.getOperand(0).getReg(), MRI, *GR).first; AssignTypeInst.getOperand(1).setReg(NewReg); MI.getOperand(0).setReg(NewReg); MIB.setInsertPt(*MI.getParent(), @@ -303,7 +352,7 @@ void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, for (auto &Op : MI.operands()) { if (!Op.isReg() || Op.isDef()) continue; - auto IdOpInfo = createNewIdReg(Op.getReg(), Opc, MRI, *GR); + auto IdOpInfo = createNewIdReg(nullptr, Op.getReg(), MRI, *GR); MIB.buildInstr(IdOpInfo.second).addDef(IdOpInfo.first).addUse(Op.getReg()); Op.setReg(IdOpInfo.first); } @@ -390,6 +439,7 @@ static void generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, } insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MRI); } else if (MI.getOpcode() == TargetOpcode::G_TRUNC || + MI.getOpcode() == TargetOpcode::G_ZEXT || MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE || MI.getOpcode() == TargetOpcode::COPY || MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) { @@ -419,6 +469,7 @@ static void processInstrsWithTypeFolding(MachineFunction &MF, processInstr(MI, MIB, MRI, GR); } } + for (MachineBasicBlock &MBB : MF) { for (MachineInstr &MI : MBB) { // We need to rewrite dst types for ASSIGN_TYPE instrs to be able @@ -431,16 +482,18 @@ static void processInstrsWithTypeFolding(MachineFunction &MF, if (!isTypeFoldingSupported(Opcode)) continue; Register DstReg = MI.getOperand(0).getReg(); - if (MRI.getType(DstReg).isVector()) + bool IsDstPtr = MRI.getType(DstReg).isPointer(); + if (IsDstPtr || MRI.getType(DstReg).isVector()) MRI.setRegClass(DstReg, &SPIRV::IDRegClass); // Don't need to reset type of register holding constant and used in - // G_ADDRSPACE_CAST, since it braaks legalizer. + // G_ADDRSPACE_CAST, since it breaks legalizer. if (Opcode == TargetOpcode::G_CONSTANT && MRI.hasOneUse(DstReg)) { MachineInstr &UseMI = *MRI.use_instr_begin(DstReg); if (UseMI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST) continue; } - MRI.setType(DstReg, LLT::scalar(32)); + MRI.setType(DstReg, IsDstPtr ? 
LLT::pointer(0, GR->getPointerSize()) + : LLT::scalar(32)); } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp index 5983c9229cb3c..ecd99f1840d7e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBankInfo.cpp @@ -27,23 +27,7 @@ using namespace llvm; const RegisterBank & SPIRVRegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, LLT Ty) const { - switch (RC.getID()) { - case SPIRV::TYPERegClassID: + if (RC.getID() == SPIRV::TYPERegClassID) return SPIRV::TYPERegBank; - case SPIRV::pIDRegClassID: - case SPIRV::IDRegClassID: - return SPIRV::IDRegBank; - case SPIRV::fIDRegClassID: - return SPIRV::fIDRegBank; - case SPIRV::vIDRegClassID: - return SPIRV::vIDRegBank; - case SPIRV::vfIDRegClassID: - return SPIRV::vfIDRegBank; - case SPIRV::vpIDRegClassID: - return SPIRV::vpIDRegBank; - case SPIRV::ANYIDRegClassID: - case SPIRV::ANYRegClassID: - return SPIRV::IDRegBank; - } - llvm_unreachable("Unknown register class"); + return SPIRV::IDRegBank; } diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td index c7f1e172f3d4f..dea2ef402d3d9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterBanks.td @@ -8,9 +8,6 @@ // Although RegisterBankSelection is disabled we need to distinct the banks // as InstructionSelector RegClass checking code relies on them -def IDRegBank : RegisterBank<"IDBank", [ID]>; -def fIDRegBank : RegisterBank<"fIDBank", [fID]>; -def vIDRegBank : RegisterBank<"vIDBank", [vID]>; -def vfIDRegBank : RegisterBank<"vfIDBank", [vfID]>; -def vpIDRegBank : RegisterBank<"vpIDBank", [vpID]>; + def TYPERegBank : RegisterBank<"TYPEBank", [TYPE]>; +def IDRegBank : RegisterBank<"IDBank", [ID, fID, pID32, pID64, vID, vfID, vpID32, vpID64]>; diff --git a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td index 6d2bfb91a97f1..9231d22e8d836 100644 --- a/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVRegisterInfo.td @@ -11,39 +11,46 @@ //===----------------------------------------------------------------------===// let Namespace = "SPIRV" in { - def p0 : PtrValueType ; - - class P0Vec - : PtrValueType { - let nElem = 2; - let ElementType = p0; - let isInteger = false; - let isFP = false; - let isVector = true; + // Pointer types for patterns with the GlobalISelEmitter + def p32 : PtrValueType ; + def p64 : PtrValueType ; + + class VTPtrVec + : VTVec, ptr.Value> { + int isPointer = true; } - def v2p0 : P0Vec; - // All registers are for 32-bit identifiers, so have a single dummy register + def v2p32 : VTPtrVec<2, p32>; + def v2p64 : VTPtrVec<2, p64>; - // Class for registers that are the result of OpTypeXXX instructions + // Class for type registers def TYPE0 : Register<"TYPE0">; def TYPE : RegisterClass<"SPIRV", [i32], 32, (add TYPE0)>; - // Class for every other non-type ID + // Class for non-type registers def ID0 : Register<"ID0">; - def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>; def fID0 : Register<"fID0">; - def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>; - def pID0 : Register<"pID0">; - def pID : RegisterClass<"SPIRV", [p0], 32, (add pID0)>; + def pID320 : Register<"pID320">; + def pID640 : Register<"pID640">; def vID0 : Register<"vID0">; - def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>; def vfID0 : Register<"vfID0">; + def vpID320 : Register<"vpID320">; + def vpID640 : 
Register<"vpID640">; + + def ID : RegisterClass<"SPIRV", [i32], 32, (add ID0)>; + def fID : RegisterClass<"SPIRV", [f32], 32, (add fID0)>; + def pID32 : RegisterClass<"SPIRV", [p32], 32, (add pID320)>; + def pID64 : RegisterClass<"SPIRV", [p64], 32, (add pID640)>; + def vID : RegisterClass<"SPIRV", [v2i32], 32, (add vID0)>; def vfID : RegisterClass<"SPIRV", [v2f32], 32, (add vfID0)>; - def vpID0 : Register<"vpID0">; - def vpID : RegisterClass<"SPIRV", [v2p0], 32, (add vpID0)>; + def vpID32 : RegisterClass<"SPIRV", [v2p32], 32, (add vpID320)>; + def vpID64 : RegisterClass<"SPIRV", [v2p64], 32, (add vpID640)>; - def ANYID : RegisterClass<"SPIRV", [i32, f32, p0, v2i32, v2f32], 32, (add ID, fID, pID, vID, vfID)>; + def ANYID : RegisterClass< + "SPIRV", + [i32, f32, p32, p64, v2i32, v2f32, v2p32, v2p64], + 32, + (add ID0, fID0, pID320, pID640, vID0, vfID0, vpID320, vpID640)>; // A few instructions like OpName can take ids from both type and non-type // instructions, so we need a super-class to allow for both to count as valid diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index fbf64f2b1dfb1..ae8baa3f11913 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -55,9 +55,9 @@ static std::string computeDataLayout(const Triple &TT) { // mean anything. if (Arch == Triple::spirv32) return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"; + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"; return "e-i64:64-v16:16-v24:32-v32:32-v48:64-" - "v96:128-v192:256-v256:256-v512:512-v1024:1024"; + "v96:128-v192:256-v256:256-v512:512-v1024:1024-G1"; } static Reloc::Model getEffectiveRelocModel(std::optional RM) { diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td index 5e792427cca28..4d68f93efeac1 100644 --- a/llvm/lib/Target/Sparc/SparcInstrInfo.td +++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td @@ -693,38 +693,38 @@ let DecoderNamespace = "SparcV8", Predicates = [HasNoV9] in { } let rd = 0 in { - let Defs = [CPSR] in { - def STCSRrr : F3_1<3, 0b110101, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Uses = [CPSR] in { + def STCSRrr : F3_1<3, 0b110101, (outs), (ins (MEMrr $rs1, $rs2):$addr), "st %csr, [$addr]", [], IIC_st>; - def STCSRri : F3_2<3, 0b110101, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STCSRri : F3_2<3, 0b110101, (outs), (ins (MEMri $rs1, $simm13):$addr), "st %csr, [$addr]", [], IIC_st>; } - let Defs = [CPQ] in { - def STDCQrr : F3_1<3, 0b110110, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Uses = [CPQ] in { + def STDCQrr : F3_1<3, 0b110110, (outs), (ins (MEMrr $rs1, $rs2):$addr), "std %cq, [$addr]", [], IIC_std>; - def STDCQri : F3_2<3, 0b110110, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STDCQri : F3_2<3, 0b110110, (outs), (ins (MEMri $rs1, $simm13):$addr), "std %cq, [$addr]", [], IIC_std>; } } let rd = 0 in { - let Defs = [FSR] in { - def STFSRrr : F3_1<3, 0b100101, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, Uses = [FSR] in { + def STFSRrr : F3_1<3, 0b100101, (outs), (ins (MEMrr $rs1, $rs2):$addr), "st %fsr, [$addr]", [], IIC_st>; - def STFSRri : F3_2<3, 0b100101, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STFSRri : F3_2<3, 0b100101, (outs), (ins (MEMri $rs1, $simm13):$addr), "st %fsr, [$addr]", [], IIC_st>; } - let Defs = [FQ] in { - def STDFQrr : F3_1<3, 0b100110, (outs (MEMrr $rs1, $rs2):$addr), (ins), + let mayStore = 1, 
Defs = [FQ] in { + def STDFQrr : F3_1<3, 0b100110, (outs), (ins (MEMrr $rs1, $rs2):$addr), "std %fq, [$addr]", [], IIC_std>; - def STDFQri : F3_2<3, 0b100110, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STDFQri : F3_2<3, 0b100110, (outs), (ins (MEMri $rs1, $simm13):$addr), "std %fq, [$addr]", [], IIC_std>; } } -let rd = 1, Defs = [FSR] in { - def STXFSRrr : F3_1<3, 0b100101, (outs (MEMrr $rs1, $rs2):$addr), (ins), +let rd = 1, mayStore = 1, Uses = [FSR] in { + def STXFSRrr : F3_1<3, 0b100101, (outs), (ins (MEMrr $rs1, $rs2):$addr), "stx %fsr, [$addr]", []>, Requires<[HasV9]>; - def STXFSRri : F3_2<3, 0b100101, (outs (MEMri $rs1, $simm13):$addr), (ins), + def STXFSRri : F3_2<3, 0b100101, (outs), (ins (MEMri $rs1, $simm13):$addr), "stx %fsr, [$addr]", []>, Requires<[HasV9]>; } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 4897b37d8eb1e..50ecd6e074414 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -824,10 +824,7 @@ void SystemZELFFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); if (DoneMBB != nullptr) { // Compute the live-in lists for the new blocks. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*DoneMBB) || recomputeLiveIns(*LoopMBB); - } while (anyChange); + fullyRecomputeLiveIns({DoneMBB, LoopMBB}); } } @@ -1425,10 +1422,7 @@ void SystemZXPLINKFrameLowering::inlineStackProbe( StackAllocMI->eraseFromParent(); // Compute the live-in lists for the new blocks. - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*StackExtMBB) || recomputeLiveIns(*NextMBB); - } while (anyChange); + fullyRecomputeLiveIns({StackExtMBB, NextMBB}); } bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index a7fe329b064ee..8ddc742004292 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -43,6 +43,12 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const { if (getTargetTriple().getArch() != Triple::x86_64) return false; + // Remaining logic below is ELF-specific. For other object file formats where + // the large code model is mostly used for JIT compilation, just look at the + // code model. + if (!getTargetTriple().isOSBinFormatELF()) + return getCodeModel() == CodeModel::Large; + auto *GO = GVal->getAliaseeObject(); // Be conservative if we can't find an underlying GlobalObject. @@ -51,9 +57,20 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const { auto *GV = dyn_cast(GO); + auto IsPrefix = [](StringRef Name, StringRef Prefix) { + return Name.consume_front(Prefix) && (Name.empty() || Name[0] == '.'); + }; + // Functions/GlobalIFuncs are only large under the large code model. - if (!GV) + if (!GV) { + // Handle explicit sections as we do for GlobalVariables with an explicit + // section, see comments below. + if (GO->hasSection()) { + StringRef Name = GO->getSection(); + return IsPrefix(Name, ".ltext"); + } return getCodeModel() == CodeModel::Large; + } if (GV->isThreadLocal()) return false; @@ -73,11 +90,8 @@ bool TargetMachine::isLargeGlobalValue(const GlobalValue *GVal) const { // data sections. The code model attribute overrides this above. 
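// The hunk above hoists the section-name test into a shared
// IsPrefix(Name, Prefix) helper: a section counts as "large" when its name is
// exactly the prefix (".lbss", ".ldata", ".lrodata", or ".ltext" for code) or
// the prefix followed by a '.'-separated suffix such as ".ldata.rel.ro". A
// minimal standalone sketch of that matching rule; std::string_view stands in
// for llvm::StringRef and the function name is hypothetical:
#include <cassert>
#include <string_view>

static bool isLargeSectionPrefix(std::string_view Name,
                                 std::string_view Prefix) {
  // Mirrors StringRef::consume_front: strip Prefix, then require the rest to
  // be empty or to start a new '.'-separated component.
  if (Name.substr(0, Prefix.size()) != Prefix)
    return false;
  Name.remove_prefix(Prefix.size());
  return Name.empty() || Name.front() == '.';
}

int main() {
  assert(isLargeSectionPrefix(".ldata", ".ldata"));        // exact match
  assert(isLargeSectionPrefix(".ldata.rel.ro", ".ldata")); // dotted suffix
  assert(!isLargeSectionPrefix(".ldatafoo", ".ldata"));    // not a component
}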
  if (GV->hasSection()) {
     StringRef Name = GV->getSection();
-    auto IsPrefix = [&](StringRef Prefix) {
-      StringRef S = Name;
-      return S.consume_front(Prefix) && (S.empty() || S[0] == '.');
-    };
-    return IsPrefix(".lbss") || IsPrefix(".ldata") || IsPrefix(".lrodata");
+    return IsPrefix(Name, ".lbss") || IsPrefix(Name, ".ldata") ||
+           IsPrefix(Name, ".lrodata");
   }
 
   // Respect large data threshold for medium and large code models.
diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
index e7c9e60ba95f1..9e85424e76e62 100644
--- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.cpp
@@ -13,10 +13,13 @@
 #include "X86RegisterBankInfo.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterBank.h"
 #include "llvm/CodeGen/RegisterBankInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/IntrinsicsX86.h"
 
 #define GET_TARGET_REGBANK_IMPL
 #include "X86GenRegisterBank.inc"
@@ -68,6 +71,98 @@ X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
   llvm_unreachable("Unsupported register kind yet.");
 }
 
+// \returns true if a given intrinsic only uses and defines FPRs.
+static bool isFPIntrinsic(const MachineRegisterInfo &MRI,
+                          const MachineInstr &MI) {
+  // TODO: Add more intrinsics.
+  switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
+  default:
+    return false;
+  // SSE1
+  case Intrinsic::x86_sse_rcp_ss:
+  case Intrinsic::x86_sse_rcp_ps:
+  case Intrinsic::x86_sse_rsqrt_ss:
+  case Intrinsic::x86_sse_rsqrt_ps:
+  case Intrinsic::x86_sse_min_ss:
+  case Intrinsic::x86_sse_min_ps:
+  case Intrinsic::x86_sse_max_ss:
+  case Intrinsic::x86_sse_max_ps:
+    return true;
+  }
+  return false;
+}
+
+bool X86RegisterBankInfo::hasFPConstraints(const MachineInstr &MI,
+                                           const MachineRegisterInfo &MRI,
+                                           const TargetRegisterInfo &TRI,
+                                           unsigned Depth) const {
+  unsigned Op = MI.getOpcode();
+  if (Op == TargetOpcode::G_INTRINSIC && isFPIntrinsic(MRI, MI))
+    return true;
+
+  // Do we have an explicit floating point instruction?
+  if (isPreISelGenericFloatingPointOpcode(Op))
+    return true;
+
+  // No. Check if we have a copy-like instruction. If we do, then we could
+  // still be fed by floating point instructions.
+  if (Op != TargetOpcode::COPY && !MI.isPHI() &&
+      !isPreISelGenericOptimizationHint(Op))
+    return false;
+
+  // Check if we already know the register bank.
+  auto *RB = getRegBank(MI.getOperand(0).getReg(), MRI, TRI);
+  if (RB == &getRegBank(X86::PSRRegBankID))
+    return true;
+  if (RB == &getRegBank(X86::GPRRegBankID))
+    return false;
+
+  // We don't know anything.
+  //
+  // If we have a phi, we may be able to infer that it will be assigned a fp
+  // type based off of its inputs.
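// The check that follows walks through PHIs only up to a fixed budget
// (MaxFPRSearchDepth). A small standalone model of that idea, classifying a
// node as FP if any input proves FP within the depth cap; the Node type and
// names are illustrative, not the X86RegisterBankInfo API:
#include <cassert>
#include <vector>

struct Node {
  bool IsFPOp;  // an explicitly floating-point operation
  bool IsPhi;   // a copy-like/phi node: FP-ness flows through it
  std::vector<const Node *> Inputs;
};

static bool maybeFP(const Node &N, unsigned Depth, unsigned MaxDepth = 2) {
  if (N.IsFPOp)
    return true;
  if (!N.IsPhi || Depth > MaxDepth) // cap the walk, like MaxFPRSearchDepth
    return false;
  for (const Node *In : N.Inputs)
    if (maybeFP(*In, Depth + 1, MaxDepth))
      return true;
  return false;
}

int main() {
  Node FAdd{true, false, {}};
  Node Phi1{false, true, {&FAdd}};
  Node Phi2{false, true, {&Phi1}};
  assert(maybeFP(Phi2, 0)); // an FP def is reachable within the depth budget
}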
+ if (!MI.isPHI() || Depth > MaxFPRSearchDepth) + return false; + + return any_of(MI.explicit_uses(), [&](const MachineOperand &Op) { + return Op.isReg() && + onlyDefinesFP(*MRI.getVRegDef(Op.getReg()), MRI, TRI, Depth + 1); + }); +} + +bool X86RegisterBankInfo::onlyUsesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_FPTOSI: + case TargetOpcode::G_FPTOUI: + case TargetOpcode::G_FCMP: + case TargetOpcode::G_LROUND: + case TargetOpcode::G_LLROUND: + case TargetOpcode::G_INTRINSIC_TRUNC: + case TargetOpcode::G_INTRINSIC_ROUND: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI, Depth); +} + +bool X86RegisterBankInfo::onlyDefinesFP(const MachineInstr &MI, + const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth) const { + switch (MI.getOpcode()) { + case TargetOpcode::G_SITOFP: + case TargetOpcode::G_UITOFP: + return true; + default: + break; + } + return hasFPConstraints(MI, MRI, TRI, Depth); +} + X86GenRegisterBankInfo::PartialMappingIdx X86GenRegisterBankInfo::getPartialMappingIdx(const MachineInstr &MI, const LLT &Ty, bool isFP) { @@ -180,11 +275,13 @@ X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, const RegisterBankInfo::InstructionMapping & X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); unsigned Opc = MI.getOpcode(); - // Try the default logic for non-generic instructions that are either copies - // or already have some operands assigned to banks. + // Try the default logic for non-generic instructions that are either + // copies or already have some operands assigned to banks. if (!isPreISelGenericOpcode(Opc) || Opc == TargetOpcode::G_PHI) { const InstructionMapping &Mapping = getInstrMappingImpl(MI); if (Mapping.isValid()) @@ -221,13 +318,14 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case TargetOpcode::G_FPEXT: case TargetOpcode::G_FPTRUNC: case TargetOpcode::G_FCONSTANT: - // Instruction having only floating-point operands (all scalars in VECRReg) + // Instruction having only floating-point operands (all scalars in + // VECRReg) getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ true, OpRegBankIdx); break; case TargetOpcode::G_SITOFP: case TargetOpcode::G_FPTOSI: { - // Some of the floating-point instructions have mixed GPR and FP operands: - // fine-tune the computed mapping. + // Some of the floating-point instructions have mixed GPR and FP + // operands: fine-tune the computed mapping. auto &Op0 = MI.getOperand(0); auto &Op1 = MI.getOperand(1); const LLT Ty0 = MRI.getType(Op0.getReg()); @@ -271,9 +369,36 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ isFPTrunc || isFPAnyExt, OpRegBankIdx); - } break; + break; + } + case TargetOpcode::G_LOAD: { + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + bool IsFP = any_of(MRI.use_nodbg_instructions(cast(MI).getDstReg()), + [&](const MachineInstr &UseMI) { + // If we have at least one direct use in a FP + // instruction, assume this was a floating point load + // in the IR. 
If it was not, we would have had a + // bitcast before reaching that instruction. + return onlyUsesFP(UseMI, MRI, TRI); + }); + getInstrPartialMappingIdxs(MI, MRI, IsFP, OpRegBankIdx); + break; + } + case TargetOpcode::G_STORE: { + // Check if that store is fed by fp instructions. + Register VReg = cast(MI).getValueReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + bool IsFP = onlyDefinesFP(*DefMI, MRI, TRI); + getInstrPartialMappingIdxs(MI, MRI, IsFP, OpRegBankIdx); + break; + } default: - // Track the bank of each register, use NotFP mapping (all scalars in GPRs) + // Track the bank of each register, use NotFP mapping (all scalars in + // GPRs) getInstrPartialMappingIdxs(MI, MRI, /* isFP= */ false, OpRegBankIdx); break; } diff --git a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h index 989c5956ad591..8f38e717e36b0 100644 --- a/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h +++ b/llvm/lib/Target/X86/GISel/X86RegisterBankInfo.h @@ -62,6 +62,22 @@ class X86RegisterBankInfo final : public X86GenRegisterBankInfo { const SmallVectorImpl &OpRegBankIdx, SmallVectorImpl &OpdsMapping); + // Maximum recursion depth for hasFPConstraints. + const unsigned MaxFPRSearchDepth = 2; + + /// \returns true if \p MI only uses and defines FPRs. + bool hasFPConstraints(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, + unsigned Depth = 0) const; + + /// \returns true if \p MI only uses FPRs. + bool onlyUsesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + + /// \returns true if \p MI only defines FPRs. + bool onlyDefinesFP(const MachineInstr &MI, const MachineRegisterInfo &MRI, + const TargetRegisterInfo &TRI, unsigned Depth = 0) const; + public: X86RegisterBankInfo(const TargetRegisterInfo &TRI); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index d914e1b61ab07..4521401d8741c 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -885,10 +885,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( } // Update Live In information - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*tailMBB) || recomputeLiveIns(*testMBB); - } while (anyChange); + fullyRecomputeLiveIns({tailMBB, testMBB}); } void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64( @@ -1380,11 +1377,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, footMBB->addSuccessor(&MBB); } - bool anyChange = false; - do { - anyChange = recomputeLiveIns(*footMBB) || recomputeLiveIns(*bodyMBB) || - recomputeLiveIns(*headMBB) || recomputeLiveIns(MBB); - } while (anyChange); + fullyRecomputeLiveIns({footMBB, bodyMBB, headMBB, &MBB}); } } else { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AndOp), Reg) diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 4e4241efd63d6..7dcde2a508949 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -2927,11 +2927,10 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { - // Cannot use 32 bit constants to reference objects in kernel code model. - // Cannot use 32 bit constants to reference objects in large PIC mode since - // GOTOFF is 64 bits. 
+ // Cannot use 32 bit constants to reference objects in kernel/large code + // model. if (TM.getCodeModel() == CodeModel::Kernel || - (TM.getCodeModel() == CodeModel::Large && TM.isPositionIndependent())) + TM.getCodeModel() == CodeModel::Large) return false; // In static codegen with small code model, we can get the address of a label diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f16a751a166d6..27107f554fccf 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1276,6 +1276,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); } + if (Subtarget.hasGFNI()) { + setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); + setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); + } + if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { setOperationAction(ISD::ABS, MVT::v16i8, Legal); setOperationAction(ISD::ABS, MVT::v8i16, Legal); @@ -1286,13 +1293,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTLZ, VT, Custom); } - if (Subtarget.hasGFNI()) { - setOperationAction(ISD::BITREVERSE, MVT::i8, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i16, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i32, Custom); - setOperationAction(ISD::BITREVERSE, MVT::i64, Custom); - } - // These might be better off as horizontal vector ops. setOperationAction(ISD::ADD, MVT::i16, Custom); setOperationAction(ISD::ADD, MVT::i32, Custom); diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td index 27a0c889a4da3..e27aa4115990e 100644 --- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td +++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td @@ -58,8 +58,8 @@ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in { } let SchedRW = [WriteCMOV, ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault], Predicates = [HasCMOV, HasCF, In64BitMode], mayStore = 1 in - def mr : ITy<0x40, MRMDestMemCC, t, (outs t.MemOperand:$dst), - (ins t.RegClass:$src1, ccode:$cond), + def mr : ITy<0x40, MRMDestMemCC, t, (outs), + (ins t.MemOperand:$dst, t.RegClass:$src1, ccode:$cond), "cfcmov${cond}", unaryop_ndd_args, []>, UseEFLAGS, NF; } diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index b466624e13348..d111c4d4ecc1a 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3402,6 +3402,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v32i16, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v16i16, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v8i16, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v32i16, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v16i16, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i16, { 1, 1, 1, 1 } }, }; static const CostKindTblEntry AVX512BITALGCostTbl[] = { { ISD::CTPOP, MVT::v32i16, { 1, 1, 1, 1 } }, @@ -3498,6 +3501,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v64i8, { 5, 6, 12, 14 } }, { ISD::ROTR, MVT::v32i8, { 5, 14, 6, 9 } }, { ISD::ROTR, MVT::v16i8, { 5, 14, 6, 9 } }, + { X86ISD::VROTLI, MVT::v32i16, { 2, 5, 3, 3 } }, + { X86ISD::VROTLI, MVT::v16i16, { 1, 5, 3, 3 } }, + { X86ISD::VROTLI, MVT::v8i16, { 1, 5, 3, 3 } }, + { X86ISD::VROTLI, 
MVT::v64i8, { 2, 9, 3, 4 } }, + { X86ISD::VROTLI, MVT::v32i8, { 1, 9, 3, 4 } }, + { X86ISD::VROTLI, MVT::v16i8, { 1, 8, 3, 4 } }, { ISD::SADDSAT, MVT::v32i16, { 1 } }, { ISD::SADDSAT, MVT::v64i8, { 1 } }, { ISD::SMAX, MVT::v32i16, { 1, 1, 1, 1 } }, @@ -3556,6 +3565,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v16i32, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v8i32, { 1, 1, 1, 1 } }, { ISD::ROTR, MVT::v4i32, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i64, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v4i64, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v2i64, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v16i32, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i32, { 1, 1, 1, 1 } }, + { X86ISD::VROTLI, MVT::v4i32, { 1, 1, 1, 1 } }, { ISD::SMAX, MVT::v8i64, { 1, 3, 1, 1 } }, { ISD::SMAX, MVT::v16i32, { 1, 1, 1, 1 } }, { ISD::SMAX, MVT::v32i16, { 3, 7, 5, 5 } }, @@ -3642,7 +3657,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ROTR, MVT::v2i64, { 1, 3, 3, 3 } }, { ISD::ROTR, MVT::v4i32, { 1, 3, 3, 3 } }, { ISD::ROTR, MVT::v8i16, { 1, 3, 3, 3 } }, - { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } } + { ISD::ROTR, MVT::v16i8, { 1, 3, 3, 3 } }, + { X86ISD::VROTLI, MVT::v4i64, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v8i32, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v16i16, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v32i8, { 4, 7, 5, 6 } }, + { X86ISD::VROTLI, MVT::v2i64, { 1, 3, 1, 1 } }, + { X86ISD::VROTLI, MVT::v4i32, { 1, 3, 1, 1 } }, + { X86ISD::VROTLI, MVT::v8i16, { 1, 3, 1, 1 } }, + { X86ISD::VROTLI, MVT::v16i8, { 1, 3, 1, 1 } }, }; static const CostKindTblEntry AVX2CostTbl[] = { { ISD::ABS, MVT::v2i64, { 2, 4, 3, 5 } }, // VBLENDVPD(X,VPSUBQ(0,X),X) @@ -3820,6 +3843,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::FSQRT, MVT::v2f64, { 27, 27, 1, 1 } }, // vsqrtpd { ISD::FSQRT, MVT::v4f64, { 54, 54, 1, 3 } }, // vsqrtpd }; + static const CostKindTblEntry GFNICostTbl[] = { + { ISD::BITREVERSE, MVT::i8, { 3, 3, 3, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::i16, { 3, 3, 4, 6 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::i32, { 3, 3, 4, 5 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::i64, { 3, 3, 4, 6 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v16i8, { 1, 6, 1, 2 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v32i8, { 1, 6, 1, 2 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v64i8, { 1, 6, 1, 2 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v8i16, { 1, 8, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v16i16, { 1, 9, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v32i16, { 1, 9, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v4i32, { 1, 8, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v8i32, { 1, 9, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v16i32, { 1, 9, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v2i64, { 1, 8, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v4i64, { 1, 9, 2, 4 } }, // gf2p8affineqb + { ISD::BITREVERSE, MVT::v8i64, { 1, 9, 2, 4 } }, // gf2p8affineqb + }; static const CostKindTblEntry GLMCostTbl[] = { { ISD::FSQRT, MVT::f32, { 19, 20, 1, 1 } }, // sqrtss { ISD::FSQRT, MVT::v4f32, { 37, 41, 1, 5 } }, // sqrtps @@ -4078,9 +4119,11 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, const SmallVectorImpl &Args = ICA.getArgs(); if (Args[0] == Args[1]) { ISD = ISD::ROTL; - // Handle scalar constant rotation amounts. - // TODO: Handle vector + funnel-shift cases. 
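// The costing logic above treats a funnel shift whose two value operands are
// the same register (Args[0] == Args[1]) as a rotate, which is why it can be
// priced as ROTL/VROTLI. A standalone exhaustive check of that identity on
// 8-bit values; plain C++ helpers, independent of the TTI code:
#include <cassert>
#include <cstdint>

static uint8_t fshl8(uint8_t Hi, uint8_t Lo, unsigned C) {
  C &= 7; // the funnel-shift amount is taken modulo the bit width
  return C ? (uint8_t)((Hi << C) | (Lo >> (8 - C))) : Hi;
}

static uint8_t rotl8(uint8_t X, unsigned C) {
  C &= 7;
  return C ? (uint8_t)((X << C) | (X >> (8 - C))) : X;
}

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned C = 0; C < 8; ++C)
      assert(fshl8((uint8_t)X, (uint8_t)X, C) ==
             rotl8((uint8_t)X, C)); // fshl(x, x, c) == rotl(x, c)
}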
- if (isa_and_nonnull(Args[2])) + // Handle uniform constant rotation amounts. + // TODO: Handle funnel-shift cases. + const APInt *Amt; + if (Args[2] && + PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt))) ISD = X86ISD::VROTLI; } } @@ -4091,10 +4134,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, if (!ICA.isTypeBasedOnly()) { const SmallVectorImpl &Args = ICA.getArgs(); if (Args[0] == Args[1]) { - // Handle scalar constant rotation amount. - // TODO: Handle vector + funnel-shift cases. ISD = ISD::ROTR; - if (isa_and_nonnull(Args[2])) + // Handle uniform constant rotation amount. + // TODO: Handle funnel-shift cases. + const APInt *Amt; + if (Args[2] && + PatternMatch::match(Args[2], PatternMatch::m_APIntAllowUndef(Amt))) ISD = X86ISD::VROTLI; } } @@ -4156,23 +4201,6 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, std::pair LT = getTypeLegalizationCost(OpTy); MVT MTy = LT.second; - // Attempt to lookup cost. - if (ISD == ISD::BITREVERSE && ST->hasGFNI() && ST->hasSSSE3() && - MTy.isVector()) { - // With PSHUFB the code is very similar for all types. If we have integer - // byte operations, we just need a GF2P8AFFINEQB for vXi8. For other types - // we also need a PSHUFB. - unsigned Cost = MTy.getVectorElementType() == MVT::i8 ? 1 : 2; - - // Without byte operations, we need twice as many GF2P8AFFINEQB and PSHUFB - // instructions. We also need an extract and an insert. - if (!(MTy.is128BitVector() || (ST->hasAVX2() && MTy.is256BitVector()) || - (ST->hasBWI() && MTy.is512BitVector()))) - Cost = Cost * 2 + 2; - - return LT.first * Cost; - } - // Without BMI/LZCNT see if we're only looking for a *_ZERO_UNDEF cost. if (((ISD == ISD::CTTZ && !ST->hasBMI()) || (ISD == ISD::CTLZ && !ST->hasLZCNT())) && @@ -4230,6 +4258,12 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return adjustTableCost(Entry->ISD, *KindCost, LT.first, ICA.getFlags()); + if (ST->hasGFNI()) + if (const auto *Entry = CostTableLookup(GFNICostTbl, ISD, MTy)) + if (auto KindCost = Entry->Cost[CostKind]) + return adjustTableCost(Entry->ISD, *KindCost, LT.first, + ICA.getFlags()); + if (ST->hasCDI()) if (const auto *Entry = CostTableLookup(AVX512CDCostTbl, ISD, MTy)) if (auto KindCost = Entry->Cost[CostKind]) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index f27d8d64a1040..41b66aafe7d34 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -2453,7 +2453,7 @@ bool AANonNull::isImpliedByIR(Attributor &A, const IRPosition &IRP, if (llvm::any_of(Worklist, [&](AA::ValueAndContext VAC) { return !isKnownNonZero( - VAC.getValue(), /*Depth=*/0, + VAC.getValue(), SimplifyQuery(A.getDataLayout(), DT, AC, VAC.getCtxI())); })) return false; diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 14612b251d1a4..7ebf265e17ba1 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1175,7 +1175,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, Value *RetVal = FlowsToReturn[i]; // If this value is locally known to be non-null, we're good - if (isKnownNonZero(RetVal, /*Depth=*/0, DL)) + if (isKnownNonZero(RetVal, DL)) continue; // Otherwise, we need to look upwards since we can't make any local diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp 
b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 07c50d866544b..c59b867b10e7d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -988,7 +988,7 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) { if (C->isOne()) { if (match(Op0, m_ZExt(m_Add(m_Value(X), m_AllOnes())))) { const SimplifyQuery Q = SQ.getWithInstruction(&Add); - if (llvm::isKnownNonZero(X, /*Depth=*/0, Q)) + if (llvm::isKnownNonZero(X, Q)) return new ZExtInst(X, Ty); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 2c0c4ee46e809..0f4fbf5bbfbbd 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1039,9 +1039,9 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp, match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) && (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) { auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) { - if (!isKnownNonZero(NonZero, /*Depth=*/0, Q)) + if (!isKnownNonZero(NonZero, Q)) std::swap(NonZero, Other); - return isKnownNonZero(NonZero, /*Depth=*/0, Q); + return isKnownNonZero(NonZero, Q); }; // Given ZeroCmpOp = (A + B) @@ -2538,6 +2538,8 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { } } + // and(shl(zext(X), Y), SignMask) -> and(sext(X), SignMask) + // where Y is a valid shift amount. if (match(&I, m_And(m_OneUse(m_Shl(m_ZExt(m_Value(X)), m_Value(Y))), m_SignMask())) && match(Y, m_SpecificInt_ICMP( @@ -2546,15 +2548,7 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { Ty->getScalarSizeInBits() - X->getType()->getScalarSizeInBits())))) { auto *SExt = Builder.CreateSExt(X, Ty, X->getName() + ".signext"); - auto *SanitizedSignMask = cast(Op1); - // We must be careful with the undef elements of the sign bit mask, however: - // the mask elt can be undef iff the shift amount for that lane was undef, - // otherwise we need to sanitize undef masks to zero. - SanitizedSignMask = Constant::replaceUndefsWith( - SanitizedSignMask, ConstantInt::getNullValue(Ty->getScalarType())); - SanitizedSignMask = - Constant::mergeUndefsWith(SanitizedSignMask, cast(Y)); - return BinaryOperator::CreateAnd(SExt, SanitizedSignMask); + return BinaryOperator::CreateAnd(SExt, Op1); } if (Instruction *Z = narrowMaskedBinOp(I)) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 20f51c8af617d..60e4be883f513 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -601,8 +601,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // then change the 'ZeroIsPoison' parameter to 'true' // because we know the zero behavior can't affect the result. 
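// A mechanical API migration runs through the hunks above and below:
// llvm::isKnownNonZero() no longer takes a caller-supplied starting Depth, so
// call sites drop the /*Depth=*/0 argument and pass only the value plus a
// SimplifyQuery (or DataLayout) carrying the context. A sketch of a migrated
// call site, assuming the current ValueTracking signatures; the helper name
// is hypothetical:
#include "llvm/Analysis/SimplifyQuery.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instruction.h"

static bool operandIsProvablyNonZero(llvm::Instruction &I, unsigned OpNo,
                                     const llvm::SimplifyQuery &SQ) {
  // Recursion depth is now tracked internally by the analysis.
  return llvm::isKnownNonZero(I.getOperand(OpNo), SQ.getWithInstruction(&I));
}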
if (!Known.One.isZero() || - isKnownNonZero(Op0, /*Depth=*/0, - IC.getSimplifyQuery().getWithInstruction(&II))) { + isKnownNonZero(Op0, IC.getSimplifyQuery().getWithInstruction(&II))) { if (!match(II.getArgOperand(1), m_One())) return IC.replaceOperand(II, 1, IC.Builder.getTrue()); } @@ -1774,6 +1773,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { if (Instruction *I = moveAddAfterMinMax(II, Builder)) return I; + // minmax (X & NegPow2C, Y & NegPow2C) --> minmax(X, Y) & NegPow2C + const APInt *RHSC; + if (match(I0, m_OneUse(m_And(m_Value(X), m_NegatedPower2(RHSC)))) && + match(I1, m_OneUse(m_And(m_Value(Y), m_SpecificInt(*RHSC))))) + return BinaryOperator::CreateAnd(Builder.CreateBinaryIntrinsic(IID, X, Y), + ConstantInt::get(II->getType(), *RHSC)); + // smax(X, -X) --> abs(X) // smin(X, -X) --> -abs(X) // umax(X, -X) --> -abs(X) @@ -1815,7 +1821,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return NewMinMax; // Try to fold minmax with constant RHS based on range information - const APInt *RHSC; if (match(I1, m_APIntAllowUndef(RHSC))) { ICmpInst::Predicate Pred = ICmpInst::getNonStrictPredicate(MinMaxIntrinsic::getPredicate(IID)); @@ -2061,8 +2066,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // See if we can deduce non-null. if (!CI.hasRetAttr(Attribute::NonNull) && (Known.isNonZero() || - isKnownNonZero(II, /*Depth=*/0, - getSimplifyQuery().getWithInstruction(II)))) { + isKnownNonZero(II, getSimplifyQuery().getWithInstruction(II)))) { CI.addRetAttr(Attribute::NonNull); Changed = true; } @@ -3408,6 +3412,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return I; break; } + case Intrinsic::threadlocal_address: { + Align MinAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT); + MaybeAlign Align = II->getRetAlign(); + if (MinAlign > Align.valueOrOne()) { + II->addRetAttr(Attribute::getWithAlignment(II->getContext(), MinAlign)); + return II; + } + break; + } default: { // Handle target specific intrinsics std::optional V = targetInstCombineIntrinsic(*II); @@ -3649,8 +3662,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { for (Value *V : Call.args()) { if (V->getType()->isPointerTy() && !Call.paramHasAttr(ArgNo, Attribute::NonNull) && - isKnownNonZero(V, /*Depth=*/0, - getSimplifyQuery().getWithInstruction(&Call))) + isKnownNonZero(V, getSimplifyQuery().getWithInstruction(&Call))) ArgNos.push_back(ArgNo); ArgNo++; } @@ -3830,7 +3842,7 @@ Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) { // isKnownNonNull -> nonnull attribute if (!GCR.hasRetAttr(Attribute::NonNull) && - isKnownNonZero(DerivedPtr, /*Depth=*/0, + isKnownNonZero(DerivedPtr, getSimplifyQuery().getWithInstruction(&Call))) { GCR.addRetAttr(Attribute::NonNull); // We discovered new fact, re-check users. 
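// The new fold above rewrites minmax(X & C, Y & C) to minmax(X, Y) & C when C
// is a negated power of two (an all-ones prefix followed by zeros). Masking
// with such a constant rounds the low bits away, which is monotone in both
// unsigned and signed order, and min/max commute with monotone functions. A
// standalone exhaustive 8-bit check of the umin and smax variants:
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (int K = 1; K < 8; ++K) {
    const uint8_t M = (uint8_t)(0xFFu << K); // NegPow2C, e.g. 0xF8 is -8
    for (int X = 0; X < 256; ++X)
      for (int Y = 0; Y < 256; ++Y) {
        const uint8_t UX = (uint8_t)X, UY = (uint8_t)Y;
        // umin(X & M, Y & M) == umin(X, Y) & M
        assert(std::min((uint8_t)(UX & M), (uint8_t)(UY & M)) ==
               (uint8_t)(std::min(UX, UY) & M));
        const int8_t SX = (int8_t)UX, SY = (int8_t)UY;
        // smax(X & M, Y & M) == smax(X, Y) & M
        assert(std::max((int8_t)(SX & M), (int8_t)(SY & M)) ==
               (int8_t)(std::max(SX, SY) & M));
      }
  }
}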
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 437e9b92c7032..4537a47da2ced 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -1977,11 +1977,25 @@ Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) { } Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) { - return commonCastTransforms(CI); + if (Instruction *R = commonCastTransforms(CI)) + return R; + if (!CI.hasNonNeg() && isKnownNonNegative(CI.getOperand(0), SQ)) { + CI.setNonNeg(); + return &CI; + } + return nullptr; } Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) { - return commonCastTransforms(CI); + if (Instruction *R = commonCastTransforms(CI)) + return R; + if (isKnownNonNegative(CI.getOperand(0), SQ)) { + auto *UI = + CastInst::Create(Instruction::UIToFP, CI.getOperand(0), CI.getType()); + UI->setNonNeg(true); + return UI; + } + return nullptr; } Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 90550cdbdf891..de90907701743 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1273,12 +1273,12 @@ Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) { // if X non-zero and NoOverflow(X * Y) // (icmp eq/ne Y) - if (!XKnown.One.isZero() || isKnownNonZero(X, /*Depth=*/0, Q)) + if (!XKnown.One.isZero() || isKnownNonZero(X, Q)) return new ICmpInst(Pred, Y, Cmp.getOperand(1)); // if Y non-zero and NoOverflow(X * Y) // (icmp eq/ne X) - if (!YKnown.One.isZero() || isKnownNonZero(Y, /*Depth=*/0, Q)) + if (!YKnown.One.isZero() || isKnownNonZero(Y, Q)) return new ICmpInst(Pred, X, Cmp.getOperand(1)); } // Note, we are skipping cases: @@ -3087,7 +3087,7 @@ Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp, // (X + -1) X <=u C (if X is never null) if (Pred == CmpInst::ICMP_ULT && C2->isAllOnes()) { const SimplifyQuery Q = SQ.getWithInstruction(&Cmp); - if (llvm::isKnownNonZero(X, /*Depth=*/0, Q)) + if (llvm::isKnownNonZero(X, Q)) return new ICmpInst(ICmpInst::ICMP_ULE, X, ConstantInt::get(Ty, C)); } @@ -4275,7 +4275,7 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst::Predicate Pred, Value *Op0, // Look for: x & ~Mask pred ~Mask if (isMaskOrZero(X, /*Not=*/true, Q)) { - return !ICmpInst::isSigned(Pred) || isKnownNonZero(X, /*Depth=*/0, Q); + return !ICmpInst::isSigned(Pred) || isKnownNonZero(X, Q); } return false; } @@ -4779,7 +4779,7 @@ static Instruction *foldICmpXorXX(ICmpInst &I, const SimplifyQuery &Q, // icmp (X ^ Y_NonZero) s>= X --> icmp (X ^ Y_NonZero) s> X // icmp (X ^ Y_NonZero) s<= X --> icmp (X ^ Y_NonZero) s< X CmpInst::Predicate PredOut = CmpInst::getStrictPredicate(Pred); - if (PredOut != Pred && isKnownNonZero(A, /*Depth=*/0, Q)) + if (PredOut != Pred && isKnownNonZero(A, Q)) return new ICmpInst(PredOut, Op0, Op1); return nullptr; @@ -5062,11 +5062,11 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, return new ICmpInst(Pred, C, D); // (A - B) u>=/u< A --> B u>/u<= A iff B != 0 if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) && - isKnownNonZero(B, /*Depth=*/0, Q)) + isKnownNonZero(B, Q)) return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A); // C u<=/u> (C - D) --> C u= D iff B != 0 if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == 
ICmpInst::ICMP_UGT) && - isKnownNonZero(D, /*Depth=*/0, Q)) + isKnownNonZero(D, Q)) return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D); // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow. @@ -5108,13 +5108,13 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I, // X * Z eq/ne Y * Z -> X eq/ne Y if (ZKnown.countMaxTrailingZeros() == 0) return new ICmpInst(Pred, X, Y); - NonZero = !ZKnown.One.isZero() || isKnownNonZero(Z, /*Depth=*/0, Q); + NonZero = !ZKnown.One.isZero() || isKnownNonZero(Z, Q); // if Z != 0 and nsw(X * Z) and nsw(Y * Z) // X * Z eq/ne Y * Z -> X eq/ne Y if (NonZero && BO0 && BO1 && Op0HasNSW && Op1HasNSW) return new ICmpInst(Pred, X, Y); } else - NonZero = isKnownNonZero(Z, /*Depth=*/0, Q); + NonZero = isKnownNonZero(Z, Q); // If Z != 0 and nuw(X * Z) and nuw(Y * Z) // X * Z u{lt/le/gt/ge}/eq/ne Y * Z -> X u{lt/le/gt/ge}/eq/ne Y @@ -8097,6 +8097,14 @@ Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) { return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I); } + // fcmp (fadd X, 0.0), Y --> fcmp X, Y + if (match(Op0, m_FAdd(m_Value(X), m_AnyZeroFP()))) + return new FCmpInst(Pred, X, Op1, "", &I); + + // fcmp X, (fadd Y, 0.0) --> fcmp X, Y + if (match(Op1, m_FAdd(m_Value(Y), m_AnyZeroFP()))) + return new FCmpInst(Pred, Op0, Y, "", &I); + if (match(Op0, m_FPExt(m_Value(X)))) { // fcmp (fpext X), (fpext Y) -> fcmp X, Y if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType()) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 4dc1319f1c437..7b86fcde8937b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -319,19 +319,12 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { } // abs(X) * abs(X) -> X * X - // nabs(X) * nabs(X) -> X * X - if (Op0 == Op1) { - Value *X, *Y; - SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor; - if (SPF == SPF_ABS || SPF == SPF_NABS) - return BinaryOperator::CreateMul(X, X); - - if (match(Op0, m_Intrinsic(m_Value(X)))) - return BinaryOperator::CreateMul(X, X); - } + Value *X; + if (Op0 == Op1 && match(Op0, m_Intrinsic(m_Value(X)))) + return BinaryOperator::CreateMul(X, X); { - Value *X, *Y; + Value *Y; // abs(X) * abs(Y) -> abs(X * Y) if (I.hasNoSignedWrap() && match(Op0, @@ -344,7 +337,7 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { } // -X * C --> X * -C - Value *X, *Y; + Value *Y; Constant *Op1C; if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C))) return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C)); @@ -631,31 +624,38 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { Value *Op1 = I.getOperand(1); Value *X, *Y; Constant *C; + BinaryOperator *Op0BinOp; // Reassociate constant RHS with another constant to form constant // expression. - if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) { + if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP() && + match(Op0, m_AllowReassoc(m_BinOp(Op0BinOp)))) { + // Everything in this scope folds I with Op0, intersecting their FMF. 
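// As the added comment above says, the reassociation folds now intersect the
// fast-math flags of the two instructions being combined, so the result only
// keeps guarantees that held on both originals. A toy flag-set model of that
// intersection; FMFBits is a simplified stand-in for llvm::FastMathFlags:
#include <cassert>

struct FMFBits {
  unsigned Bits; // e.g. bit 0 = reassoc, bit 1 = nnan, bit 2 = ninf
  FMFBits operator&(FMFBits O) const { return {Bits & O.Bits}; }
  bool allowReassoc() const { return Bits & 1; }
};

int main() {
  FMFBits Mul{/*reassoc|nnan*/ 0b011}, Div{/*reassoc|ninf*/ 0b101};
  FMFBits Combined = Mul & Div;
  assert(Combined.allowReassoc()); // both sides allow reassociation
  assert(Combined.Bits == 0b001);  // nnan/ninf survive only if on both
}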
+ FastMathFlags FMF = I.getFastMathFlags() & Op0BinOp->getFastMathFlags(); + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(FMF); Constant *C1; if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) { // (C1 / X) * C --> (C * C1) / X Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL); if (CC1 && CC1->isNormalFP()) - return BinaryOperator::CreateFDivFMF(CC1, X, &I); + return BinaryOperator::CreateFDivFMF(CC1, X, FMF); } if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) { + // FIXME: This seems like it should also be checking for arcp // (X / C1) * C --> X * (C / C1) Constant *CDivC1 = ConstantFoldBinaryOpOperands(Instruction::FDiv, C, C1, DL); if (CDivC1 && CDivC1->isNormalFP()) - return BinaryOperator::CreateFMulFMF(X, CDivC1, &I); + return BinaryOperator::CreateFMulFMF(X, CDivC1, FMF); // If the constant was a denormal, try reassociating differently. // (X / C1) * C --> X / (C1 / C) Constant *C1DivC = ConstantFoldBinaryOpOperands(Instruction::FDiv, C1, C, DL); if (C1DivC && Op0->hasOneUse() && C1DivC->isNormalFP()) - return BinaryOperator::CreateFDivFMF(X, C1DivC, &I); + return BinaryOperator::CreateFDivFMF(X, C1DivC, FMF); } // We do not need to match 'fadd C, X' and 'fsub X, C' because they are @@ -665,26 +665,33 @@ Instruction *InstCombinerImpl::foldFMulReassoc(BinaryOperator &I) { // (X + C1) * C --> (X * C) + (C * C1) if (Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) { - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFAddFMF(XC, CC1, &I); + Value *XC = Builder.CreateFMul(X, C); + return BinaryOperator::CreateFAddFMF(XC, CC1, FMF); } } if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) { // (C1 - X) * C --> (C * C1) - (X * C) if (Constant *CC1 = ConstantFoldBinaryOpOperands(Instruction::FMul, C, C1, DL)) { - Value *XC = Builder.CreateFMulFMF(X, C, &I); - return BinaryOperator::CreateFSubFMF(CC1, XC, &I); + Value *XC = Builder.CreateFMul(X, C); + return BinaryOperator::CreateFSubFMF(CC1, XC, FMF); } } } Value *Z; if (match(&I, - m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))), m_Value(Z)))) { - // Sink division: (X / Y) * Z --> (X * Z) / Y - Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I); - return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I); + m_c_FMul(m_AllowReassoc(m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))), + m_Value(Z)))) { + BinaryOperator *DivOp = cast(((Z == Op0) ? 
Op1 : Op0)); + FastMathFlags FMF = I.getFastMathFlags() & DivOp->getFastMathFlags(); + if (FMF.allowReassoc()) { + // Sink division: (X / Y) * Z --> (X * Z) / Y + IRBuilder<>::FastMathFlagGuard FMFGuard(Builder); + Builder.setFastMathFlags(FMF); + auto *NewFMul = Builder.CreateFMul(X, Z); + return BinaryOperator::CreateFDivFMF(NewFMul, Y, FMF); + } } // sqrt(X) * sqrt(Y) -> sqrt(X * Y) diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index 9838e2aa9f3a2..52803e9bea451 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -1537,8 +1537,7 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) { for (unsigned I = 0, E = PN.getNumIncomingValues(); I != E; ++I) { Instruction *CtxI = PN.getIncomingBlock(I)->getTerminator(); Value *VA = PN.getIncomingValue(I); - if (isKnownNonZero(VA, 0, - getSimplifyQuery().getWithInstruction(CtxI))) { + if (isKnownNonZero(VA, getSimplifyQuery().getWithInstruction(CtxI))) { if (!NonZeroConst) NonZeroConst = getAnyNonZeroConstInt(PN); if (NonZeroConst != VA) { diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 4c00f2a0ea176..5a144cc737896 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1431,7 +1431,7 @@ Instruction *InstCombinerImpl::foldFBinOpOfIntCastsFromSign( if (OpsKnown[OpNo].hasKnownBits() && OpsKnown[OpNo].getKnownBits(SQ).isNonZero()) return true; - return isKnownNonZero(IntOps[OpNo], /*Depth=*/0, SQ); + return isKnownNonZero(IntOps[OpNo], SQ); }; auto IsNonNeg = [&](unsigned OpNo) -> bool { diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index a72b0ee9a08e0..ee3531bbd68df 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1281,7 +1281,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // ignored. return; } - if (llvm::isKnownNonZero(ConvertedShadow, /*Depth=*/0, DL)) { + if (llvm::isKnownNonZero(ConvertedShadow, DL)) { // Copy origin as the value is definitely uninitialized. paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize, OriginAlignment); @@ -1427,7 +1427,7 @@ struct MemorySanitizerVisitor : public InstVisitor { // Skip, value is initialized or const shadow is ignored. continue; } - if (llvm::isKnownNonZero(ConvertedShadow, /*Depth=*/0, DL)) { + if (llvm::isKnownNonZero(ConvertedShadow, DL)) { // Report as the value is definitely uninitialized. 
insertWarningFn(IRB, ShadowData.Origin); if (!MS.Recover) diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp index 200bad22148f0..fcc82eadac36c 100644 --- a/llvm/lib/Transforms/Utils/Debugify.cpp +++ b/llvm/lib/Transforms/Utils/Debugify.cpp @@ -87,10 +87,6 @@ bool llvm::applyDebugifyMetadata( return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - DIBuilder DIB(M); LLVMContext &Ctx = M.getContext(); auto *Int32Ty = Type::getInt32Ty(Ctx); @@ -214,9 +210,6 @@ bool llvm::applyDebugifyMetadata( if (!M.getModuleFlag(DIVersionKey)) M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION); - if (NewDebugMode) - M.convertToNewDbgValues(); - return true; } @@ -311,10 +304,6 @@ bool llvm::collectDebugInfoMetadata(Module &M, return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - uint64_t FunctionsCnt = DebugInfoBeforePass.DIFunctions.size(); // Visit each instruction. for (Function &F : Functions) { @@ -349,20 +338,23 @@ bool llvm::collectDebugInfoMetadata(Module &M, // Cllect dbg.values and dbg.declare. if (DebugifyLevel > Level::Locations) { - if (auto *DVI = dyn_cast(&I)) { + auto HandleDbgVariable = [&](auto *DbgVar) { if (!SP) - continue; + return; // Skip inlined variables. - if (I.getDebugLoc().getInlinedAt()) - continue; + if (DbgVar->getDebugLoc().getInlinedAt()) + return; // Skip undef values. - if (DVI->isKillLocation()) - continue; + if (DbgVar->isKillLocation()) + return; - auto *Var = DVI->getVariable(); + auto *Var = DbgVar->getVariable(); DebugInfoBeforePass.DIVariables[Var]++; - continue; - } + }; + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + HandleDbgVariable(&DVR); + if (auto *DVI = dyn_cast(&I)) + HandleDbgVariable(DVI); } // Skip debug instructions other than dbg.value and dbg.declare. @@ -379,9 +371,6 @@ bool llvm::collectDebugInfoMetadata(Module &M, } } - if (NewDebugMode) - M.convertToNewDbgValues(); - return true; } @@ -561,10 +550,6 @@ bool llvm::checkDebugInfoMetadata(Module &M, return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - // Map the debug info holding DIs after a pass. DebugInfoPerPass DebugInfoAfterPass; @@ -599,20 +584,23 @@ bool llvm::checkDebugInfoMetadata(Module &M, // Collect dbg.values and dbg.declares. if (DebugifyLevel > Level::Locations) { - if (auto *DVI = dyn_cast(&I)) { + auto HandleDbgVariable = [&](auto *DbgVar) { if (!SP) - continue; + return; // Skip inlined variables. - if (I.getDebugLoc().getInlinedAt()) - continue; + if (DbgVar->getDebugLoc().getInlinedAt()) + return; // Skip undef values. - if (DVI->isKillLocation()) - continue; + if (DbgVar->isKillLocation()) + return; - auto *Var = DVI->getVariable(); + auto *Var = DbgVar->getVariable(); DebugInfoAfterPass.DIVariables[Var]++; - continue; - } + }; + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + HandleDbgVariable(&DVR); + if (auto *DVI = dyn_cast(&I)) + HandleDbgVariable(DVI); } // Skip debug instructions other than dbg.value and dbg.declare. @@ -675,16 +663,14 @@ bool llvm::checkDebugInfoMetadata(Module &M, // the debugging information from the previous pass. DebugInfoBeforePass = DebugInfoAfterPass; - if (NewDebugMode) - M.convertToNewDbgValues(); - LLVM_DEBUG(dbgs() << "\n\n"); return Result; } namespace { -/// Return true if a mis-sized diagnostic is issued for \p DVI. 
-bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { +/// Return true if a mis-sized diagnostic is issued for \p DbgVal. +template +bool diagnoseMisSizedDbgValue(Module &M, DbgValTy *DbgVal) { // The size of a dbg.value's value operand should match the size of the // variable it corresponds to. // @@ -693,22 +679,22 @@ bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { // For now, don't try to interpret anything more complicated than an empty // DIExpression. Eventually we should try to handle OP_deref and fragments. - if (DVI->getExpression()->getNumElements()) + if (DbgVal->getExpression()->getNumElements()) return false; - Value *V = DVI->getVariableLocationOp(0); + Value *V = DbgVal->getVariableLocationOp(0); if (!V) return false; Type *Ty = V->getType(); uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty); - std::optional DbgVarSize = DVI->getFragmentSizeInBits(); + std::optional DbgVarSize = DbgVal->getFragmentSizeInBits(); if (!ValueOperandSize || !DbgVarSize) return false; bool HasBadSize = false; if (Ty->isIntegerTy()) { - auto Signedness = DVI->getVariable()->getSignedness(); + auto Signedness = DbgVal->getVariable()->getSignedness(); if (Signedness && *Signedness == DIBasicType::Signedness::Signed) HasBadSize = ValueOperandSize < *DbgVarSize; } else { @@ -718,7 +704,7 @@ bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) { if (HasBadSize) { dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize << ", but its variable has size " << *DbgVarSize << ": "; - DVI->print(dbg()); + DbgVal->print(dbg()); dbg() << "\n"; } return HasBadSize; @@ -735,10 +721,6 @@ bool checkDebugifyMetadata(Module &M, return false; } - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - auto getDebugifyOperand = [&](unsigned Idx) -> unsigned { return mdconst::extract(NMD->getOperand(Idx)->getOperand(0)) ->getZExtValue(); @@ -780,18 +762,23 @@ bool checkDebugifyMetadata(Module &M, } // Find missing variables and mis-sized debug values. 
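// The Debugify changes above serve both the intrinsic form (DbgValueInst) and
// the new DbgVariableRecord form with a single body, relying only on the
// member functions the two types share (getVariable(), isKillLocation(), and
// so on). A standalone sketch of that duck-typing pattern, with toy types in
// place of the LLVM classes:
#include <cassert>
#include <string>

struct OldStyleDbgValue { std::string Var() const { return "x"; } };
struct NewStyleDbgRecord { std::string Var() const { return "x"; } };

int main() {
  int Seen = 0;
  // One generic lambda serves both unrelated types, as HandleDbgVariable and
  // the templated diagnoseMisSizedDbgValue do above.
  auto Handle = [&](auto *Dbg) { Seen += Dbg->Var() == "x"; };
  OldStyleDbgValue DVI;
  NewStyleDbgRecord DVR;
  Handle(&DVI);
  Handle(&DVR);
  assert(Seen == 2);
}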
- for (Instruction &I : instructions(F)) { - auto *DVI = dyn_cast(&I); - if (!DVI) - continue; - + auto CheckForMisSized = [&](auto *DbgVal) { unsigned Var = ~0U; - (void)to_integer(DVI->getVariable()->getName(), Var, 10); + (void)to_integer(DbgVal->getVariable()->getName(), Var, 10); assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable"); - bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI); + bool HasBadSize = diagnoseMisSizedDbgValue(M, DbgVal); if (!HasBadSize) MissingVars.reset(Var - 1); HasErrors |= HasBadSize; + }; + for (Instruction &I : instructions(F)) { + for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) + if (DVR.isDbgValue() || DVR.isDbgAssign()) + CheckForMisSized(&DVR); + auto *DVI = dyn_cast(&I); + if (!DVI) + continue; + CheckForMisSized(DVI); } } @@ -820,9 +807,6 @@ bool checkDebugifyMetadata(Module &M, if (Strip) Ret = stripDebugifyMetadata(M); - if (NewDebugMode) - M.convertToNewDbgValues(); - return Ret; } @@ -1052,10 +1036,6 @@ FunctionPass *createCheckDebugifyFunctionPass( PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, ModuleAnalysisManager &) { - bool NewDebugMode = M.IsNewDbgInfoFormat; - if (NewDebugMode) - M.convertFromNewDbgValues(); - if (Mode == DebugifyMode::SyntheticDebugInfo) checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass, "CheckModuleDebugify", Strip, StatsMap); @@ -1065,9 +1045,6 @@ PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M, "CheckModuleDebugify (original debuginfo)", NameOfWrappedPass, OrigDIVerifyBugsReportFilePath); - if (NewDebugMode) - M.convertToNewDbgValues(); - return PreservedAnalyses::all(); } diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 380bac9c61807..a42ef0c4e6ae9 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3627,10 +3627,12 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, return createIntegerExpression(C); auto *FP = dyn_cast(&C); - if (FP && (Ty.isFloatTy() || Ty.isDoubleTy())) { + if (FP && Ty.isFloatingPointTy() && Ty.getScalarSizeInBits() <= 64) { const APFloat &APF = FP->getValueAPF(); - return DIB.createConstantValueExpression( - APF.bitcastToAPInt().getZExtValue()); + APInt const &API = APF.bitcastToAPInt(); + if (auto Temp = API.getZExtValue()) + return DIB.createConstantValueExpression(static_cast(Temp)); + return DIB.createConstantValueExpression(*API.getRawData()); } if (!Ty.isPointerTy()) diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 9d816c5220532..73c5d63678229 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1034,6 +1034,15 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) { } } +Value *llvm::createAnyOfOp(IRBuilderBase &Builder, Value *StartVal, + RecurKind RK, Value *Left, Value *Right) { + if (auto VTy = dyn_cast(Left->getType())) + StartVal = Builder.CreateVectorSplat(VTy->getElementCount(), StartVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Left, StartVal, "rdx.select.cmp"); + return Builder.CreateSelect(Cmp, Left, Right, "rdx.select"); +} + Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right) { Type *Ty = Left->getType(); @@ -1142,13 +1151,16 @@ Value *llvm::createAnyOfTargetReduction(IRBuilderBase &Builder, Value *Src, NewVal = SI->getTrueValue(); } + // Create a splat vector with the new value and compare this to the vector + // we want to 
reduce. + ElementCount EC = cast(Src->getType())->getElementCount(); + Value *Right = Builder.CreateVectorSplat(EC, InitVal); + Value *Cmp = + Builder.CreateCmp(CmpInst::ICMP_NE, Src, Right, "rdx.select.cmp"); + // If any predicate is true it means that we want to select the new value. - Value *AnyOf = - Src->getType()->isVectorTy() ? Builder.CreateOrReduce(Src) : Src; - // The compares in the loop may yield poison, which propagates through the - // bitwise ORs. Freeze it here before the condition is used. - AnyOf = Builder.CreateFreeze(AnyOf); - return Builder.CreateSelect(AnyOf, NewVal, InitVal, "rdx.select"); + Cmp = Builder.CreateOrReduce(Cmp); + return Builder.CreateSelect(Cmp, NewVal, InitVal, "rdx.select"); } Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder, Value *Src, diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index f376b5f7d68d4..40d0f6b75d69b 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -459,7 +459,7 @@ static void convertMetadataToAssumes(LoadInst *LI, Value *Val, // we can only do this if the value is known non-poison. if (AC && LI->getMetadata(LLVMContext::MD_nonnull) && LI->getMetadata(LLVMContext::MD_noundef) && - !isKnownNonZero(Val, /*Depth=*/0, SimplifyQuery(DL, DT, AC, LI))) + !isKnownNonZero(Val, SimplifyQuery(DL, DT, AC, LI))) addAssumeNonNull(AC, LI); } diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 440fe0790d795..31be7d62c8d1d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -1153,6 +1153,7 @@ class WidenIV { Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter, PHINode *OrigPhi, PHINode *WidePhi); + void truncateIVUse(NarrowIVDefUse DU); bool widenLoopCompare(NarrowIVDefUse DU); bool widenWithVariantUse(NarrowIVDefUse DU); @@ -1569,15 +1570,18 @@ WidenIV::WidenedRecTy WidenIV::getWideRecurrence(WidenIV::NarrowIVDefUse DU) { /// This IV user cannot be widened. Replace this use of the original narrow IV /// with a truncation of the new wide IV to isolate and eliminate the narrow IV. -static void truncateIVUse(WidenIV::NarrowIVDefUse DU, DominatorTree *DT, - LoopInfo *LI) { +void WidenIV::truncateIVUse(NarrowIVDefUse DU) { auto *InsertPt = getInsertPointForUses(DU.NarrowUse, DU.NarrowDef, DT, LI); if (!InsertPt) return; LLVM_DEBUG(dbgs() << "INDVARS: Truncate IV " << *DU.WideDef << " for user " << *DU.NarrowUse << "\n"); + ExtendKind ExtKind = getExtendKind(DU.NarrowDef); IRBuilder<> Builder(InsertPt); - Value *Trunc = Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType()); + Value *Trunc = + Builder.CreateTrunc(DU.WideDef, DU.NarrowDef->getType(), "", + DU.NeverNegative || ExtKind == ExtendKind::Zero, + DU.NeverNegative || ExtKind == ExtendKind::Sign); DU.NarrowUse->replaceUsesOfWith(DU.NarrowDef, Trunc); } @@ -1826,6 +1830,13 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, assert(ExtendKindMap.count(DU.NarrowDef) && "Should already know the kind of extension used to widen NarrowDef"); + // This narrow use can be widened by a sext if it's non-negative or its narrow + // def was widened by a sext. Same for zext. 
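// truncateIVUse and widenIVUse now emit trunc with wrap flags (the two extra
// bool arguments to CreateTrunc): when the wide value is a zero-extension of
// the narrow one, truncation discards only zero bits (trunc nuw); when it is
// a sign-extension, the narrow result keeps the sign (trunc nsw). A
// standalone model of those round-trip invariants on i32/i64; plain C++ casts
// stand in for zext/sext/trunc:
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Samples[] = {0u, 1u, 42u, 0x7FFFFFFFu, 0x80000000u,
                              0xFFFFFFFFu};
  for (uint32_t Narrow : Samples) {
    // zext i32 -> i64: truncating back discards only zero bits ("trunc nuw").
    uint64_t Wide = (uint64_t)Narrow;
    assert((uint32_t)Wide == Narrow);
    // sext i32 -> i64: the narrow value keeps its sign ("trunc nsw").
    int64_t SWide = (int64_t)(int32_t)Narrow;
    assert((int32_t)SWide == (int32_t)Narrow);
  }
}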
+ bool CanWidenBySExt = + DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign; + bool CanWidenByZExt = + DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero; + // Stop traversing the def-use chain at inner-loop phis or post-loop phis. if (PHINode *UsePhi = dyn_cast(DU.NarrowUse)) { if (LI->getLoopFor(UsePhi->getParent()) != L) { @@ -1833,7 +1844,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, // After SimplifyCFG most loop exit targets have a single predecessor. // Otherwise fall back to a truncate within the loop. if (UsePhi->getNumOperands() != 1) - truncateIVUse(DU, DT, LI); + truncateIVUse(DU); else { // Widening the PHI requires us to insert a trunc. The logical place // for this trunc is in the same BB as the PHI. This is not possible if @@ -1847,7 +1858,8 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, WidePhi->addIncoming(DU.WideDef, UsePhi->getIncomingBlock(0)); BasicBlock *WidePhiBB = WidePhi->getParent(); IRBuilder<> Builder(WidePhiBB, WidePhiBB->getFirstInsertionPt()); - Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType()); + Value *Trunc = Builder.CreateTrunc(WidePhi, DU.NarrowDef->getType(), "", + CanWidenByZExt, CanWidenBySExt); UsePhi->replaceAllUsesWith(Trunc); DeadInsts.emplace_back(UsePhi); LLVM_DEBUG(dbgs() << "INDVARS: Widen lcssa phi " << *UsePhi << " to " @@ -1857,18 +1869,9 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, } } - // This narrow use can be widened by a sext if it's non-negative or its narrow - // def was widened by a sext. Same for zext. - auto canWidenBySExt = [&]() { - return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign; - }; - auto canWidenByZExt = [&]() { - return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Zero; - }; - // Our raison d'etre! Eliminate sign and zero extension. - if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && canWidenBySExt()) || - (isa(DU.NarrowUse) && canWidenByZExt())) { + if ((match(DU.NarrowUse, m_SExtLike(m_Value())) && CanWidenBySExt) || + (isa(DU.NarrowUse) && CanWidenByZExt)) { Value *NewDef = DU.WideDef; if (DU.NarrowUse->getType() != WideType) { unsigned CastWidth = SE->getTypeSizeInBits(DU.NarrowUse->getType()); @@ -1876,7 +1879,8 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, if (CastWidth < IVWidth) { // The cast isn't as wide as the IV, so insert a Trunc. IRBuilder<> Builder(DU.NarrowUse); - NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType()); + NewDef = Builder.CreateTrunc(DU.WideDef, DU.NarrowUse->getType(), "", + CanWidenByZExt, CanWidenBySExt); } else { // A wider extend was hidden behind a narrower one. This may induce @@ -1975,7 +1979,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, // This user does not evaluate to a recurrence after widening, so don't // follow it. Instead insert a Trunc to kill off the original use, // eventually isolating the original narrow IV so it can be removed. 
- truncateIVUse(DU, DT, LI); + truncateIVUse(DU); return nullptr; } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index 7e9e91606fe22..2e68a9c01898c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -305,7 +305,7 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef A if (ConstantInt *LenC = dyn_cast(Size)) { annotateNonNullNoUndefBasedOnAccess(CI, ArgNos); annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue()); - } else if (isKnownNonZero(Size, /*Depth=*/0, DL)) { + } else if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, ArgNos); const APInt *X, *Y; uint64_t DerefMin = 1; @@ -394,7 +394,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); uint64_t Len; annotateNonNullNoUndefBasedOnAccess(CI, 0); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) + if (isKnownNonZero(Size, DL)) annotateNonNullNoUndefBasedOnAccess(CI, 1); // We don't do anything if length is not constant. @@ -613,7 +613,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) { if (Str1P == Str2P) // strncmp(x,x,n) -> 0 return ConstantInt::get(CI->getType(), 0); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) + if (isKnownNonZero(Size, DL)) annotateNonNullNoUndefBasedOnAccess(CI, {0, 1}); // Get the length argument if it is constant. uint64_t Length; @@ -749,7 +749,7 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) { Value *LibCallSimplifier::optimizeStrLCpy(CallInst *CI, IRBuilderBase &B) { Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) + if (isKnownNonZero(Size, DL)) // Like snprintf, the function stores into the destination only when // the size argument is nonzero. annotateNonNullNoUndefBasedOnAccess(CI, 0); @@ -833,7 +833,7 @@ Value *LibCallSimplifier::optimizeStringNCpy(CallInst *CI, bool RetEnd, Value *Src = CI->getArgOperand(1); Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) { + if (isKnownNonZero(Size, DL)) { // Both st{p,r}ncpy(D, S, N) access the source and destination arrays // only when N is nonzero. annotateNonNullNoUndefBasedOnAccess(CI, 0); @@ -926,7 +926,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B, Type *CharTy = B.getIntNTy(CharSize); if (isOnlyUsedInZeroEqualityComparison(CI) && - (!Bound || isKnownNonZero(Bound, /*Depth=*/0, DL))) { + (!Bound || isKnownNonZero(Bound, DL))) { // Fold strlen: // strlen(x) != 0 --> *x != 0 // strlen(x) == 0 --> *x == 0 @@ -1047,7 +1047,7 @@ Value *LibCallSimplifier::optimizeStrNLen(CallInst *CI, IRBuilderBase &B) { if (Value *V = optimizeStringLength(CI, B, 8, Bound)) return V; - if (isKnownNonZero(Bound, /*Depth=*/0, DL)) + if (isKnownNonZero(Bound, DL)) annotateNonNullNoUndefBasedOnAccess(CI, 0); return nullptr; } @@ -1291,7 +1291,7 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) { Value *SrcStr = CI->getArgOperand(0); Value *Size = CI->getArgOperand(2); - if (isKnownNonZero(Size, /*Depth=*/0, DL)) { + if (isKnownNonZero(Size, DL)) { annotateNonNullNoUndefBasedOnAccess(CI, 0); if (isOnlyUsedInEqualityComparison(CI, SrcStr)) return memChrToCharCompare(CI, Size, B, DL); @@ -2976,7 +2976,7 @@ Value *LibCallSimplifier::optimizeStrToInt(CallInst *CI, IRBuilderBase &B, // It would be readonly too, except that it still may write to errno. 
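Every isKnownNonZero call site in this file changes the same mechanical way: the explicit /*Depth=*/0 argument disappears, since recursion depth is now managed internally by ValueTracking rather than threaded through every caller. One representative before/after pair from the hunks above, shown as a sketch:

    // Before: if (isKnownNonZero(Size, /*Depth=*/0, DL))
    // After:
    if (isKnownNonZero(Size, DL))
      annotateNonNullNoUndefBasedOnAccess(CI, 0);

The same rewrite appears in PromoteMemoryToRegister.cpp and VectorCombine.cpp elsewhere in this patch, there with a SimplifyQuery instead of a bare DataLayout.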
CI->addParamAttr(0, Attribute::NoCapture); EndPtr = nullptr; - } else if (!isKnownNonZero(EndPtr, /*Depth=*/0, DL)) + } else if (!isKnownNonZero(EndPtr, DL)) return nullptr; StringRef Str; @@ -3402,7 +3402,7 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) { return V; } - if (isKnownNonZero(CI->getOperand(1), /*Depth=*/0, DL)) + if (isKnownNonZero(CI->getOperand(1), DL)) annotateNonNullNoUndefBasedOnAccess(CI, 0); return nullptr; } diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index ece2a34f180cb..ebca2d855a467 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -68,7 +68,9 @@ class VPBuilder { public: VPBuilder() = default; VPBuilder(VPBasicBlock *InsertBB) { setInsertPoint(InsertBB); } - VPBuilder(VPRecipeBase *InsertPt) { setInsertPoint(InsertPt); } + VPBuilder(VPRecipeBase *InsertPt) { + setInsertPoint(InsertPt->getParent(), InsertPt->getIterator()); + } /// Clear the insertion point: created instructions will not be inserted into /// a block. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 5535cc55e9321..a8272f4502535 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -545,11 +545,6 @@ class InnerLoopVectorizer { // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } - /// A type for vectorized values in the new loop. Each value from the - /// original loop, when vectorized, is represented by UF vector values in the - /// new unrolled loop, where UF is the unroll factor. - using VectorParts = SmallVector; - /// A helper function to scalarize a single Instruction in the innermost loop. /// Generates a sequence of scalar instances for each lane between \p MinLane /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, @@ -616,9 +611,6 @@ class InnerLoopVectorizer { void fixFixedOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State); - /// Create code for the loop exit value of the reduction. - void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); - /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -3051,8 +3043,9 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( } // Create phi nodes to merge from the backedge-taken check block. - PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", - LoopScalarPreHeader->getFirstNonPHI()); + PHINode *BCResumeVal = + PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val", + LoopScalarPreHeader->getTerminator()->getIterator()); // Copy original phi DL over to the new one. 
BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc()); @@ -7450,6 +7443,7 @@ static void createAndCollectMergePhiForReduction( auto *PhiR = cast(RedResult->getOperand(0)); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + TrackingVH ReductionStartValue = RdxDesc.getRecurrenceStartValue(); Value *FinalValue = State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane())); auto *ResumePhi = @@ -7474,7 +7468,7 @@ static void createAndCollectMergePhiForReduction( BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), Incoming); else - BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); + BCBlockPhi->addIncoming(ReductionStartValue, Incoming); } auto *OrigPhi = cast(PhiR->getUnderlyingValue()); @@ -7767,10 +7761,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( // Now, compare the remaining count and if there aren't enough iterations to // execute the vectorized epilogue skip to the scalar part. - LoopVectorPreHeader->setName("vec.epilog.ph"); - BasicBlock *VecEpilogueIterationCountCheck = - SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->begin(), DT, LI, - nullptr, "vec.epilog.iter.check", true); + BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader; + VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check"); + LoopVectorPreHeader = + SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT, + LI, nullptr, "vec.epilog.ph"); emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader, VecEpilogueIterationCountCheck); @@ -8086,7 +8081,7 @@ void VPRecipeBuilder::createBlockInMask(BasicBlock *BB) { BlockMaskCache[BB] = BlockMask; } -VPWidenMemoryInstructionRecipe * +VPWidenMemoryRecipe * VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range) { assert((isa(I) || isa(I)) && @@ -8131,12 +8126,12 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, Ptr = VectorPtr; } if (LoadInst *Load = dyn_cast(I)) - return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive, - Reverse, I->getDebugLoc()); + return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, + I->getDebugLoc()); StoreInst *Store = cast(I); - return new VPWidenMemoryInstructionRecipe( - *Store, Ptr, Operands[0], Mask, Consecutive, Reverse, I->getDebugLoc()); + return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive, + Reverse, I->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also @@ -8775,13 +8770,12 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // for this VPlan, replace the Recipes widening its memory instructions with a // single VPInterleaveRecipe at its insertion point. for (const auto *IG : InterleaveGroups) { - auto *Recipe = cast( - RecipeBuilder.getRecipe(IG->getInsertPos())); + auto *Recipe = + cast(RecipeBuilder.getRecipe(IG->getInsertPos())); SmallVector StoredValues; for (unsigned i = 0; i < IG->getFactor(); ++i) if (auto *SI = dyn_cast_or_null(IG->getMember(i))) { - auto *StoreR = - cast(RecipeBuilder.getRecipe(SI)); + auto *StoreR = cast(RecipeBuilder.getRecipe(SI)); StoredValues.push_back(StoreR->getStoredValue()); } @@ -8893,10 +8887,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { // A ComputeReductionResult recipe is added to the middle block, also for // in-loop reductions which compute their result in-loop, because generating // the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 
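The new TrackingVH local in createAndCollectMergePhiForReduction (its template argument, presumably Value, is stripped in this rendering of the patch) is read again only after code that can replace the original start value runs; unlike a raw Value pointer, a TrackingVH follows replaceAllUsesWith, so the incoming value added to the merge phi below stays valid. The idiom, sketched under that assumption:

    // A TrackingVH keeps pointing at the replacement after RAUW, where a raw
    // Value* captured up front could dangle.
    TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
    // ... transforms that may RAUW the original start value ...
    BCBlockPhi->addIncoming(ReductionStartValue, Incoming); // still live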
-// -// Adjust AnyOf reductions; replace the reduction phi for the selected value -// with a boolean reduction phi node to check if the condition is true in any -// iteration. The final value is selected by the final ComputeReductionResult. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { @@ -9071,41 +9061,6 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( continue; const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - // Adjust AnyOf reductions; replace the reduction phi for the selected value - // with a boolean reduction phi node to check if the condition is true in - // any iteration. The final value is selected by the final - // ComputeReductionResult. - if (RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - auto *Select = cast(*find_if(PhiR->users(), [](VPUser *U) { - return isa(U) || - (isa(U) && - cast(U)->getUnderlyingInstr()->getOpcode() == - Instruction::Select); - })); - VPValue *Cmp = Select->getOperand(0); - // If the compare is checking the reduction PHI node, adjust it to check - // the start value. - if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe()) { - for (unsigned I = 0; I != CmpR->getNumOperands(); ++I) - if (CmpR->getOperand(I) == PhiR) - CmpR->setOperand(I, PhiR->getStartValue()); - } - VPBuilder::InsertPointGuard Guard(Builder); - Builder.setInsertPoint(Select); - - // If the true value of the select is the reduction phi, the new value is - // selected if the negated condition is true in any iteration. - if (Select->getOperand(1) == PhiR) - Cmp = Builder.createNot(Cmp); - VPValue *Or = Builder.createOr(PhiR, Cmp); - Select->getVPSingleValue()->replaceAllUsesWith(Or); - - // Convert the reduction phi to operate on bools. - PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse( - OrigLoop->getHeader()->getContext()))); - } - // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. @@ -9138,9 +9093,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // then extend the loop exit value to enable InstCombine to evaluate the // entire expression in the smaller type. Type *PhiTy = PhiR->getStartValue()->getLiveInIRValue()->getType(); - if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() && - !RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { + if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) { assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!"); Type *RdxTy = RdxDesc.getRecurrenceType(); auto *Trunc = @@ -9409,92 +9362,27 @@ static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, return Call; } -void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { - VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; - - // Attempt to issue a wide load. 
- LoadInst *LI = dyn_cast(&Ingredient); - StoreInst *SI = dyn_cast(&Ingredient); - - assert((LI || SI) && "Invalid Load/Store instruction"); - assert((!SI || StoredValue) && "No stored value provided for widened store"); - assert((!LI || !StoredValue) && "Stored value provided for widened load"); +void VPWidenLoadRecipe::execute(VPTransformState &State) { + auto *LI = cast(&Ingredient); Type *ScalarDataTy = getLoadStoreType(&Ingredient); - auto *DataTy = VectorType::get(ScalarDataTy, State.VF); const Align Alignment = getLoadStoreAlignment(&Ingredient); - bool CreateGatherScatter = !isConsecutive(); + bool CreateGather = !isConsecutive(); auto &Builder = State.Builder; - InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF); - bool isMaskRequired = getMask(); - if (isMaskRequired) { - // Mask reversal is only needed for non-all-one (null) masks, as reverse of - // a null all-one mask is a null mask. - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *Mask = State.get(getMask(), Part); + State.setDebugLocFrom(getDebugLoc()); + for (unsigned Part = 0; Part < State.UF; ++Part) { + Value *NewLI; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. + Mask = State.get(VPMask, Part); if (isReverse()) Mask = Builder.CreateVectorReverse(Mask, "reverse"); - BlockInMaskParts[Part] = Mask; } - } - // Handle Stores: - if (SI) { - State.setDebugLocFrom(getDebugLoc()); - - for (unsigned Part = 0; Part < State.UF; ++Part) { - Instruction *NewSI = nullptr; - Value *StoredVal = State.get(StoredValue, Part); - // TODO: split this into several classes for better design. - if (State.EVL) { - assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " - "explicit vector length."); - assert(cast(State.EVL)->getOpcode() == - VPInstruction::ExplicitVectorLength && - "EVL must be VPInstruction::ExplicitVectorLength."); - Value *EVL = State.get(State.EVL, VPIteration(0, 0)); - // If EVL is not nullptr, then EVL must be a valid value set during plan - // creation, possibly default value = whole vector register length. EVL - // is created only if TTI prefers predicated vectorization, thus if EVL - // is not nullptr it also implies preference for predicated - // vectorization. - // FIXME: Support reverse store after vp_reverse is added. - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - NewSI = lowerStoreUsingVectorIntrinsics( - Builder, State.get(getAddr(), Part, !CreateGatherScatter), - StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment); - } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; - Value *VectorGep = State.get(getAddr(), Part); - NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, - MaskPart); - } else { - if (isReverse()) { - // If we store to reverse consecutive memory locations, then we need - // to reverse the order of elements in the stored value. - StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); - // We don't want to update the value in the map as it might be used in - // another expression. So don't call resetVectorValue(StoredVal). 
- } - auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); - if (isMaskRequired) - NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, - BlockInMaskParts[Part]); - else - NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); - } - State.addMetadata(NewSI, SI); - } - return; - } - - // Handle loads. - assert(LI && "Must have a load instruction"); - State.setDebugLocFrom(getDebugLoc()); - for (unsigned Part = 0; Part < State.UF; ++Part) { - Value *NewLI; // TODO: split this into several classes for better design. if (State.EVL) { assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " @@ -9509,22 +9397,20 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { // is not nullptr it also implies preference for predicated // vectorization. // FIXME: Support reverse loading after vp_reverse is added. - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; NewLI = lowerLoadUsingVectorIntrinsics( - Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter), - CreateGatherScatter, MaskPart, EVL, Alignment); - } else if (CreateGatherScatter) { - Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; + Builder, DataTy, State.get(getAddr(), Part, !CreateGather), + CreateGather, Mask, EVL, Alignment); + } else if (CreateGather) { Value *VectorGep = State.get(getAddr(), Part); - NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, + NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, Mask, nullptr, "wide.masked.gather"); State.addMetadata(NewLI, LI); } else { auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); - if (isMaskRequired) - NewLI = Builder.CreateMaskedLoad( - DataTy, VecPtr, Alignment, BlockInMaskParts[Part], - PoisonValue::get(DataTy), "wide.masked.load"); + if (Mask) + NewLI = Builder.CreateMaskedLoad(DataTy, VecPtr, Alignment, Mask, + PoisonValue::get(DataTy), + "wide.masked.load"); else NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); @@ -9535,7 +9421,69 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { NewLI = Builder.CreateVectorReverse(NewLI, "reverse"); } - State.set(getVPSingleValue(), NewLI, Part); + State.set(this, NewLI, Part); + } +} + +void VPWidenStoreRecipe::execute(VPTransformState &State) { + auto *SI = cast(&Ingredient); + + VPValue *StoredVPValue = getStoredValue(); + bool CreateScatter = !isConsecutive(); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + State.setDebugLocFrom(getDebugLoc()); + + for (unsigned Part = 0; Part < State.UF; ++Part) { + Instruction *NewSI = nullptr; + Value *Mask = nullptr; + if (auto *VPMask = getMask()) { + // Mask reversal is only needed for non-all-one (null) masks, as reverse + // of a null all-one mask is a null mask. + Mask = State.get(VPMask, Part); + if (isReverse()) + Mask = Builder.CreateVectorReverse(Mask, "reverse"); + } + + Value *StoredVal = State.get(StoredVPValue, Part); + if (isReverse()) { + assert(!State.EVL && "reversing not yet implemented with EVL"); + // If we store to reverse consecutive memory locations, then we need + // to reverse the order of elements in the stored value. + StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse"); + // We don't want to update the value in the map as it might be used in + // another expression. So don't call resetVectorValue(StoredVal). + } + // TODO: split this into several classes for better design. 
+ if (State.EVL) { + assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " + "explicit vector length."); + assert(cast(State.EVL)->getOpcode() == + VPInstruction::ExplicitVectorLength && + "EVL must be VPInstruction::ExplicitVectorLength."); + Value *EVL = State.get(State.EVL, VPIteration(0, 0)); + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL + // is created only if TTI prefers predicated vectorization, thus if EVL + // is not nullptr it also implies preference for predicated + // vectorization. + // FIXME: Support reverse store after vp_reverse is added. + NewSI = lowerStoreUsingVectorIntrinsics( + Builder, State.get(getAddr(), Part, !CreateScatter), StoredVal, + CreateScatter, Mask, EVL, Alignment); + } else if (CreateScatter) { + Value *VectorGep = State.get(getAddr(), Part); + NewSI = + Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, Mask); + } else { + auto *VecPtr = State.get(getAddr(), Part, /*IsScalar*/ true); + if (Mask) + NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask); + else + NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } + State.addMetadata(NewSI, SI); } } @@ -9722,7 +9670,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, } // The scalar cost should only be 0 when vectorizing with a user specified VF/IC. In those cases, runtime checks should always be generated. - double ScalarC = *VF.ScalarCost.getValue(); + uint64_t ScalarC = *VF.ScalarCost.getValue(); if (ScalarC == 0) return true; @@ -9749,7 +9697,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC // // Now we can compute the minimum required trip count TC as - // (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC + // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC // // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result @@ -9761,9 +9709,9 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, AssumedMinimumVscale = *VScale; IntVF *= AssumedMinimumVscale; } - double VecCOverVF = double(*VF.Cost.getValue()) / IntVF; - double RtC = *CheckCost.getValue(); - double MinTC1 = RtC / (ScalarC - VecCOverVF); + uint64_t RtC = *CheckCost.getValue(); + uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue(); + uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div); // Second, compute a minimum iteration count so that the cost of the // runtime checks is only a fraction of the total scalar loop cost. This @@ -9772,12 +9720,12 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // * TC. To bound the runtime check to be a fraction 1/X of the scalar // cost, compute // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC - double MinTC2 = RtC * 10 / ScalarC; + uint64_t MinTC2 = divideCeil(RtC * 10, ScalarC); // Now pick the larger minimum. If it is not a multiple of VF and a scalar // epilogue is allowed, choose the next closest multiple of VF. This should // partly compensate for ignoring the epilogue cost. 
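With the switch from double to uint64_t arithmetic, both bounds above are computed with divideCeil instead of being rounded once at the end by std::ceil. Rearranging the inequality quoted in the comment, RtC + VecC * (TC / VF) < ScalarC * TC gives TC > RtC * VF / (ScalarC * VF - VecC), which is exactly MinTC1 = divideCeil(RtC * IntVF, Div). A worked example with invented costs (not taken from any target's model), assuming llvm::divideCeil and llvm::alignTo from MathExtras.h:

    // Hypothetical numbers: runtime checks cost RtC = 28, a scalar iteration
    // costs ScalarC = 4, and the VF = 4 vector body costs VecC = 12.
    uint64_t Div    = 4 * 4 - 12;               // ScalarC * VF - VecC == 4
    uint64_t MinTC1 = divideCeil(28 * 4, Div);  // break-even at 28 iterations
    uint64_t MinTC2 = divideCeil(28 * 10, 4);   // checks <= 10% of scalar: 70
    uint64_t MinTC  = alignTo(std::max<uint64_t>(MinTC1, MinTC2), 4); // 72

At TC = 28 both strategies cost 112 in this toy model, confirming MinTC1 as the break-even point; the 10% rule then dominates and the result is rounded up to a multiple of VF.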
- uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2)); + uint64_t MinTC = std::max(MinTC1, MinTC2); if (SEL == CM_ScalarEpilogueAllowed) MinTC = alignTo(MinTC, IntVF); VF.MinProfitableTripCount = ElementCount::getFixed(MinTC); @@ -10181,19 +10129,9 @@ bool LoopVectorizePass::processLoop(Loop *L) { Value *ResumeV = nullptr; // TODO: Move setting of resume values to prepareToExecute. if (auto *ReductionPhi = dyn_cast(&R)) { - const RecurrenceDescriptor &RdxDesc = - ReductionPhi->getRecurrenceDescriptor(); - RecurKind RK = RdxDesc.getRecurrenceKind(); - ResumeV = ReductionResumeValues.find(&RdxDesc)->second; - if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { - // VPReductionPHIRecipes for AnyOf reductions expect a boolean as - // start value; compare the final value from the main vector loop - // to the start value. - IRBuilder<> Builder( - cast(ResumeV)->getParent()->getFirstNonPHI()); - ResumeV = Builder.CreateICmpNE(ResumeV, - RdxDesc.getRecurrenceStartValue()); - } + ResumeV = ReductionResumeValues + .find(&ReductionPhi->getRecurrenceDescriptor()) + ->second; } else { // Create induction resume values for both widened pointer and // integer/fp inductions and update the start value of the induction diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c63b500f546f3..806e8085038b3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1134,6 +1134,7 @@ class BoUpSLP { MustGather.clear(); EntryToLastInstruction.clear(); ExternalUses.clear(); + ExternalUsesAsGEPs.clear(); for (auto &Iter : BlocksSchedules) { BlockScheduling *BS = Iter.second.get(); BS->clear(); @@ -3154,6 +3155,10 @@ class BoUpSLP { /// after vectorization. UserList ExternalUses; + /// A list of GEPs which can be replaced by scalar GEPs instead of + /// extractelement instructions. + SmallPtrSet ExternalUsesAsGEPs; + /// Values used only by @llvm.assume calls. SmallPtrSet EphValues; @@ -5541,6 +5546,7 @@ void BoUpSLP::buildExternalUses( << FoundLane << " from " << *Scalar << ".\n"); ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()); ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + continue; } for (User *U : Scalar->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); @@ -9925,6 +9931,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { SmallVector DemandedElts; SmallDenseSet UsedInserts; DenseSet> VectorCasts; + std::optional> ValueToExtUses; for (ExternalUser &EU : ExternalUses) { // We only add extract cost once for the same scalar. if (!isa_and_nonnull(EU.User) && @@ -10033,12 +10040,40 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } } + // Leave the GEPs as is, they are free in most cases and better to keep them + // as GEPs. + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (auto *GEP = dyn_cast(EU.Scalar)) { + if (!ValueToExtUses) { + ValueToExtUses.emplace(); + for_each(enumerate(ExternalUses), [&](const auto &P) { + ValueToExtUses->try_emplace(P.value().Scalar, P.index()); + }); + } + // Can use original GEP, if no operands vectorized or they are marked as + // externally used already. + bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) { + if (!getTreeEntry(V)) + return true; + auto It = ValueToExtUses->find(V); + if (It != ValueToExtUses->end()) { + // Replace all uses to avoid compiler crash.
+ ExternalUses[It->second].User = nullptr; + return true; + } + return false; + }); + if (CanBeUsedAsGEP) { + ExtractCost += TTI->getInstructionCost(GEP, CostKind); + ExternalUsesAsGEPs.insert(EU.Scalar); + continue; + } + } // If we plan to rewrite the tree in a smaller type, we will need to sign // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto It = MinBWs.find(getTreeEntry(EU.Scalar)); if (It != MinBWs.end()) { auto *MinTy = IntegerType::get(F->getContext(), It->second.first); @@ -13161,6 +13196,8 @@ Value *BoUpSLP::vectorizeTree( if (Scalar->getType() != Vec->getType()) { Value *Ex = nullptr; Value *ExV = nullptr; + auto *GEP = dyn_cast(Scalar); + bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP); auto It = ScalarToEEs.find(Scalar); if (It != ScalarToEEs.end()) { // No need to emit many extracts, just move the only one in the @@ -13186,6 +13223,15 @@ Value *BoUpSLP::vectorizeTree( if (const TreeEntry *ETE = getTreeEntry(V)) V = ETE->VectorizedValue; Ex = Builder.CreateExtractElement(V, ES->getIndexOperand()); + } else if (ReplaceGEP) { + // Leave the GEPs as is, they are free in most cases and better to + // keep them as GEPs. + auto *CloneGEP = GEP->clone(); + CloneGEP->insertBefore(*Builder.GetInsertBlock(), + Builder.GetInsertPoint()); + if (GEP->hasName()) + CloneGEP->takeName(GEP); + Ex = CloneGEP; } else { Ex = Builder.CreateExtractElement(Vec, Lane); } @@ -13224,6 +13270,8 @@ Value *BoUpSLP::vectorizeTree( assert((ExternallyUsedValues.count(Scalar) || any_of(Scalar->users(), [&](llvm::User *U) { + if (ExternalUsesAsGEPs.contains(U)) + return true; TreeEntry *UseEntry = getTreeEntry(U); return UseEntry && (UseEntry->State == TreeEntry::Vectorize || @@ -14639,10 +14687,16 @@ bool BoUpSLP::collectValuesToDemote( assert((ID == Intrinsic::smin || ID == Intrinsic::smax) && "Expected min/max intrinsics only."); unsigned SignBits = OrigBitWidth - BitWidth; + APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1); return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT) && + (!isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL)) || + MaskedValueIsZero(I->getOperand(0), Mask, + SimplifyQuery(*DL))) && SignBits <= ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, - nullptr, DT); + nullptr, DT) && + (!isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL)) || + MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL))); }); }; if (ID != Intrinsic::abs) { @@ -15110,10 +15164,6 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, BoUpSLP::ValueSet VectorizedStores; bool Changed = false; - // Stores the pair of stores (first_store, last_store) in a range, that were - // already tried to be vectorized. Allows to skip the store ranges that were - // already tried to be vectorized but the attempts were unsuccessful. 
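The two SLPVectorizer hunks above implement one policy: when an externally used scalar is a GEP whose operands are either not vectorized or already marked as externally used, getTreeCost charges the (usually free) scalar GEP instead of an extract, and vectorizeTree later re-materializes the address by cloning the GEP rather than emitting an extractelement. A reduced sketch of the codegen half (materializeExternalUse is a hypothetical helper; the cast target stripped from the text above is GetElementPtrInst, and insertBefore/takeName are the calls the patch itself uses):

    // Reuse the scalar address computation instead of extracting a lane.
    Value *materializeExternalUse(IRBuilderBase &Builder, Instruction *Scalar,
                                  Value *Vec, unsigned Lane, bool ReplaceGEP) {
      auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
      if (GEP && ReplaceGEP) {
        Instruction *Clone = GEP->clone();
        Clone->insertBefore(*Builder.GetInsertBlock(),
                            Builder.GetInsertPoint());
        if (GEP->hasName())
          Clone->takeName(GEP);
        return Clone; // a scalar GEP is free on most targets
      }
      return Builder.CreateExtractElement(Vec, Lane);
    }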
- DenseSet> TriedSequences; struct StoreDistCompare { bool operator()(const std::pair &Op1, const std::pair &Op2) const { @@ -15155,8 +15205,10 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, Type *ValueTy = StoreTy; if (auto *Trunc = dyn_cast(Store->getValueOperand())) ValueTy = Trunc->getSrcTy(); - unsigned MinVF = TTI->getStoreMinimumVF( - R.getMinVF(DL->getTypeSizeInBits(StoreTy)), StoreTy, ValueTy); + unsigned MinVF = std::max( + 2, PowerOf2Ceil(TTI->getStoreMinimumVF( + R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy, + ValueTy))); if (MaxVF < MinVF) { LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF @@ -15182,40 +15234,72 @@ bool SLPVectorizerPass::vectorizeStores(ArrayRef Stores, VF = Size > MaxVF ? NonPowerOf2VF : Size; Size *= 2; }); - unsigned StartIdx = 0; - for (unsigned Size : CandidateVFs) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); - assert( - all_of( - Slice, - [&](Value *V) { - return cast(V)->getValueOperand()->getType() == - cast(Slice.front()) - ->getValueOperand() - ->getType(); - }) && - "Expected all operands of same type."); - if (!VectorizedStores.count(Slice.front()) && - !VectorizedStores.count(Slice.back()) && - TriedSequences.insert(std::make_pair(Slice.front(), Slice.back())) - .second && - vectorizeStoreChain(Slice, R, Cnt, MinVF)) { - // Mark the vectorized stores so that we don't vectorize them again. - VectorizedStores.insert(Slice.begin(), Slice.end()); - Changed = true; - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += Size; - Cnt += Size; - continue; + unsigned End = Operands.size(); + unsigned Repeat = 0; + constexpr unsigned MaxAttempts = 2; + SmallBitVector Range(Operands.size()); + while (true) { + ++Repeat; + for (unsigned Size : CandidateVFs) { + int StartIdx = Range.find_first_unset(); + while (StartIdx != -1) { + int EndIdx = Range.find_next(StartIdx); + unsigned Sz = EndIdx == -1 ? End : EndIdx; + for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) { + ArrayRef Slice = ArrayRef(Operands).slice(Cnt, Size); + assert(all_of(Slice, + [&](Value *V) { + return cast(V) + ->getValueOperand() + ->getType() == + cast(Slice.front()) + ->getValueOperand() + ->getType(); + }) && + "Expected all operands of same type."); + if (vectorizeStoreChain(Slice, R, Cnt, MinVF)) { + // Mark the vectorized stores so that we don't vectorize them + // again. + VectorizedStores.insert(Slice.begin(), Slice.end()); + Changed = true; + // If we vectorized initial block, no need to try to vectorize + // it again. + Range.set(Cnt, Cnt + Size); + if (Cnt < StartIdx + MinVF) + Range.set(StartIdx, Cnt); + if (Cnt > EndIdx - Size - MinVF) { + Range.set(Cnt + Size, EndIdx); + End = Cnt; + } + Cnt += Size; + continue; + } + ++Cnt; + } + if (Sz >= End) + break; + StartIdx = Range.find_next_unset(EndIdx); } - ++Cnt; } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Operands.size()) + // All values vectorized - exit. + if (Range.all()) + break; + // Stop if we have tried all attempts or the last attempt is not needed at all.
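The rewritten store-chain search above replaces the TriedSequences set with a SmallBitVector in which a set bit means the store is already part of a vectorized chain; find_first_unset and find_next enumerate the remaining gaps, and up to MaxAttempts passes retry the leftovers with progressively larger candidate VFs. A compact sketch of just the gap walk, assuming LLVM's SmallBitVector and function_ref (forEachGap and TrySlice are hypothetical names):

    // Visit every maximal run of unset bits in Range. TrySlice attempts to
    // vectorize Size stores starting at Cnt and returns true on success,
    // after setting the corresponding bits.
    void forEachGap(llvm::SmallBitVector &Range, unsigned Size,
                    llvm::function_ref<bool(unsigned)> TrySlice) {
      int StartIdx = Range.find_first_unset();
      while (StartIdx != -1) {
        int EndIdx = Range.find_next(StartIdx); // first set bit after the gap
        unsigned GapEnd = EndIdx == -1 ? Range.size() : EndIdx;
        for (unsigned Cnt = StartIdx; Cnt + Size <= GapEnd;)
          Cnt += TrySlice(Cnt) ? Size : 1;
        if (GapEnd >= Range.size())
          break;
        StartIdx = Range.find_next_unset(EndIdx);
      }
    }

The extra Range.set calls in the patch additionally mark fringes shorter than MinVF as done, so gaps that can never host a chain are not rescanned on the next pass.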
+ if (Repeat >= MaxAttempts) + break; + constexpr unsigned MaxVFScale = 4; + constexpr unsigned StoresLimit = 16; + const unsigned MaxTotalNum = std::min( + std::max(StoresLimit, MaxVFScale * MaxVF), + bit_floor(static_cast(Range.find_last_unset() - + Range.find_first_unset() + 1))); + if (MaxVF >= MaxTotalNum) break; + // Last attempt to vectorize max number of elements, if all previous + // attempts were unsuccessful because of the cost issues. + CandidateVFs.clear(); + for (unsigned Size = MaxTotalNum; Size > MaxVF; Size /= 2) + CandidateVFs.push_back(Size); } } }; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index 605b47fa0a46b..b4c7ab02f928f 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -69,9 +69,9 @@ class VPRecipeBuilder { /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a /// recipe that takes an additional VPInstruction for the mask. - VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I, - ArrayRef Operands, - VFRange &Range); + VPWidenMemoryRecipe *tryToWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range); /// Check if an induction recipe should be constructed for \p Phi. If so build /// and return it. If not, return null. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index d86a81d4fb4c7..334b10e2e5d09 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -875,7 +875,8 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { return true; case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPBranchOnMaskSC: - case VPRecipeBase::VPWidenMemoryInstructionSC: + case VPRecipeBase::VPWidenLoadSC: + case VPRecipeBase::VPWidenStoreSC: // TODO: Widened stores don't define a value, but widened loads do. Split // the recipes to be able to make widened loads VPSingleDefRecipes. return false; @@ -1048,6 +1049,11 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe { R->getVPDefID() == VPRecipeBase::VPVectorPointerSC; } + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && classof(R); + } + /// Drop all poison-generating flags. void dropPoisonGeneratingFlags() { // NOTE: This needs to be kept in-sync with @@ -2280,68 +2286,62 @@ class VPPredInstPHIRecipe : public VPSingleDefRecipe { } }; -/// A Recipe for widening load/store operations. -/// The recipe uses the following VPValues: -/// - For load: Address, optional mask -/// - For store: Address, stored value, optional mask -/// TODO: We currently execute only per-part unless a specific instance is -/// provided. -class VPWidenMemoryInstructionRecipe : public VPRecipeBase { +/// A common base class for widening memory operations. An optional mask can be +/// provided as the last operand. +class VPWidenMemoryRecipe : public VPRecipeBase { +protected: Instruction &Ingredient; - // Whether the loaded-from / stored-to addresses are consecutive. + /// Whether the accessed addresses are consecutive. bool Consecutive; - // Whether the consecutive loaded/stored addresses are in reverse order. + /// Whether the consecutive accessed addresses are in reverse order. bool Reverse; + /// Whether the memory access is masked. 
+ bool IsMasked = false; + void setMask(VPValue *Mask) { + assert(!IsMasked && "cannot re-set mask"); if (!Mask) return; addOperand(Mask); + IsMasked = true; } - bool isMasked() const { - return isStore() ? getNumOperands() == 3 : getNumOperands() == 2; + VPWidenMemoryRecipe(const char unsigned SC, Instruction &I, + std::initializer_list Operands, + bool Consecutive, bool Reverse, DebugLoc DL) + : VPRecipeBase(SC, Operands, DL), Ingredient(I), Consecutive(Consecutive), + Reverse(Reverse) { + assert((Consecutive || !Reverse) && "Reverse implies consecutive"); } public: - VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, - bool Consecutive, bool Reverse, DebugLoc DL) - : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}, DL), - Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) { - assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - new VPValue(this, &Load); - setMask(Mask); - } + VPWidenMemoryRecipe *clone() override = 0; - VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, - VPValue *StoredValue, VPValue *Mask, - bool Consecutive, bool Reverse, DebugLoc DL) - : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}, - DL), - Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) { - assert((Consecutive || !Reverse) && "Reverse implies consecutive"); - setMask(Mask); + static inline bool classof(const VPRecipeBase *R) { + return R->getVPDefID() == VPDef::VPWidenLoadSC || + R->getVPDefID() == VPDef::VPWidenStoreSC; } - VPWidenMemoryInstructionRecipe *clone() override { - if (isStore()) - return new VPWidenMemoryInstructionRecipe( - cast(Ingredient), getAddr(), getStoredValue(), getMask(), - Consecutive, Reverse, getDebugLoc()); - - return new VPWidenMemoryInstructionRecipe(cast(Ingredient), - getAddr(), getMask(), Consecutive, - Reverse, getDebugLoc()); + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && classof(R); } - VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC) + /// Return whether the loaded-from / stored-to addresses are consecutive. + bool isConsecutive() const { return Consecutive; } + + /// Return whether the consecutive loaded/stored addresses are in reverse + /// order. + bool isReverse() const { return Reverse; } /// Return the address accessed by this recipe. - VPValue *getAddr() const { - return getOperand(0); // Address is the 1st, mandatory operand. - } + VPValue *getAddr() const { return getOperand(0); } + + /// Returns true if the recipe is masked. + bool isMasked() const { return IsMasked; } /// Return the mask used by this recipe. Note that a full mask is represented /// by a nullptr. @@ -2350,23 +2350,34 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { return isMasked() ? getOperand(getNumOperands() - 1) : nullptr; } - /// Returns true if this recipe is a store. - bool isStore() const { return isa(Ingredient); } + /// Generate the wide load/store. + void execute(VPTransformState &State) override { + llvm_unreachable("VPWidenMemoryRecipe should not be instantiated."); + } - /// Return the address accessed by this recipe. - VPValue *getStoredValue() const { - assert(isStore() && "Stored value only available for store instructions"); - return getOperand(1); // Stored value is the 2nd, mandatory operand. + Instruction &getIngredient() const { return Ingredient; } +}; + +/// A recipe for widening load operations, using the address to load from and an +/// optional mask. 
+struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe, public VPValue { + VPWidenLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask, + bool Consecutive, bool Reverse, DebugLoc DL) + : VPWidenMemoryRecipe(VPDef::VPWidenLoadSC, Load, {Addr}, Consecutive, + Reverse, DL), + VPValue(this, &Load) { + setMask(Mask); } - // Return whether the loaded-from / stored-to addresses are consecutive. - bool isConsecutive() const { return Consecutive; } + VPWidenLoadRecipe *clone() override { + return new VPWidenLoadRecipe(cast(Ingredient), getAddr(), + getMask(), Consecutive, Reverse, + getDebugLoc()); + } - // Return whether the consecutive loaded/stored addresses are in reverse - // order. - bool isReverse() const { return Reverse; } + VP_CLASSOF_IMPL(VPDef::VPWidenLoadSC); - /// Generate the wide load/store. + /// Generate a wide load or gather. void execute(VPTransformState &State) override; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -2380,16 +2391,51 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { assert(is_contained(operands(), Op) && "Op must be an operand of the recipe"); - // Widened, consecutive memory operations only demand the first lane of - // their address, unless the same operand is also stored. That latter can - // happen with opaque pointers. - return Op == getAddr() && isConsecutive() && - (!isStore() || Op != getStoredValue()); + // Widened, consecutive load operations only demand the first lane of + // their address. + return Op == getAddr() && isConsecutive(); } - - Instruction &getIngredient() const { return Ingredient; } }; +/// A recipe for widening store operations, using the stored value, the address +/// to store to and an optional mask. +struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { + VPWidenStoreRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredVal, + VPValue *Mask, bool Consecutive, bool Reverse, DebugLoc DL) + : VPWidenMemoryRecipe(VPDef::VPWidenStoreSC, Store, {Addr, StoredVal}, + Consecutive, Reverse, DL) { + setMask(Mask); + } + + VPWidenStoreRecipe *clone() override { + return new VPWidenStoreRecipe(cast(Ingredient), getAddr(), + getStoredValue(), getMask(), Consecutive, + Reverse, getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStoreSC); + + /// Return the value stored by this recipe. + VPValue *getStoredValue() const { return getOperand(1); } + + /// Generate a wide store or scatter. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + // Widened, consecutive stores only demand the first lane of their address, + // unless the same operand is also stored. + return Op == getAddr() && isConsecutive() && Op != getStoredValue(); + } +}; /// Recipe to expand a SCEV expression.
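The structural point of this VPlan.h change: only loads define a value, so VPWidenLoadRecipe inherits from both the shared base and VPValue, while VPWidenStoreRecipe inherits from the base alone, and isa/dyn_cast dispatch on the two new VPDef IDs instead of an isStore() flag. The skeleton of the pattern, with all members elided:

    // Sketch of the inheritance shape only; the real classes carry operands,
    // flags and the VP_CLASSOF_IMPL machinery shown above.
    class VPWidenMemoryRecipe /* : public VPRecipeBase */ {
      // address operand, optional mask, Consecutive/Reverse flags ...
    };
    struct VPWidenLoadRecipe final : public VPWidenMemoryRecipe,
                                     public VPValue {
      // the load is itself the VPValue carrying its result
    };
    struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
      // a store defines nothing; the stored value is operand 1
    };

Note the TODO kept in the VPSingleDefRecipe hunk further up: even after this split, the load recipe pairs VPRecipeBase with a plain VPValue rather than becoming a VPSingleDefRecipe; that is left as a follow-up.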
class VPExpandSCEVRecipe : public VPSingleDefRecipe { const SCEV *Expr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index c8ae2ee5a30fe..130fb04f586e7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -108,9 +108,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { return CI.getType(); } -Type *VPTypeAnalysis::inferScalarTypeForRecipe( - const VPWidenMemoryInstructionRecipe *R) { - assert(!R->isStore() && "Store recipes should not define any values"); +Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { + assert(isa(R) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } @@ -231,8 +231,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) { return inferScalarType(R->getOperand(0)); }) .Case( + VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>( [this](const auto *R) { return inferScalarTypeForRecipe(R); }) .Case([V](const VPInterleaveRecipe *R) { // TODO: Use info from interleave group. diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h index 4e69de7fd6812..7d310b1b31b6f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h @@ -20,7 +20,7 @@ class VPInstruction; class VPWidenRecipe; class VPWidenCallRecipe; class VPWidenIntOrFpInductionRecipe; -class VPWidenMemoryInstructionRecipe; +class VPWidenMemoryRecipe; struct VPWidenSelectRecipe; class VPReplicateRecipe; class Type; @@ -46,7 +46,7 @@ class VPTypeAnalysis { Type *inferScalarTypeForRecipe(const VPWidenCallRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenIntOrFpInductionRecipe *R); - Type *inferScalarTypeForRecipe(const VPWidenMemoryInstructionRecipe *R); + Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R); Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R); Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 625319954e9b7..78932643c81fa 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -47,9 +47,8 @@ bool VPRecipeBase::mayWriteToMemory() const { switch (getVPDefID()) { case VPInterleaveSC: return cast(this)->getNumStoreOperands() > 0; - case VPWidenMemoryInstructionSC: { - return cast(this)->isStore(); - } + case VPWidenStoreSC: + return true; case VPReplicateSC: case VPWidenCallSC: return cast(getVPSingleValue()->getUnderlyingValue()) @@ -64,6 +63,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenLoadSC: case VPWidenPHISC: case VPWidenSC: case VPWidenSelectSC: { @@ -81,16 +81,16 @@ bool VPRecipeBase::mayWriteToMemory() const { bool VPRecipeBase::mayReadFromMemory() const { switch (getVPDefID()) { - case VPWidenMemoryInstructionSC: { - return !cast(this)->isStore(); - } + case VPWidenLoadSC: + return true; case VPReplicateSC: case VPWidenCallSC: return cast(getVPSingleValue()->getUnderlyingValue()) ->mayReadFromMemory(); case VPBranchOnMaskSC: - case VPScalarIVStepsSC: case VPPredInstPHISC: + case VPScalarIVStepsSC: + case VPWidenStoreSC: return false; case VPBlendSC: case VPReductionSC: @@ -155,12 +155,13 @@ bool 
VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); - case VPWidenMemoryInstructionSC: - assert(cast(this) - ->getIngredient() - .mayHaveSideEffects() == mayWriteToMemory() && - "mayHaveSideffects result for ingredient differs from this " - "implementation"); + case VPWidenLoadSC: + case VPWidenStoreSC: + assert( + cast(this)->getIngredient().mayHaveSideEffects() == + mayWriteToMemory() && + "mayHaveSideffects result for ingredient differs from this " + "implementation"); return mayWriteToMemory(); case VPReplicateSC: { auto *R = cast(this); @@ -501,8 +502,6 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; unsigned Op = RecurrenceDescriptor::getOpcode(RK); - if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) - Op = Instruction::Or; if (PhiR->isOrdered()) { ReducedPartRdx = RdxParts[State.UF - 1]; @@ -515,16 +514,19 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { if (Op != Instruction::ICmp && Op != Instruction::FCmp) ReducedPartRdx = Builder.CreateBinOp( (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); - else + else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) { + TrackingVH ReductionStartValue = + RdxDesc.getRecurrenceStartValue(); + ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK, + ReducedPartRdx, RdxPart); + } else ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); } } // Create the reduction after the loop. Note that inloop reductions create // the target reduction in the loop using a Reduction recipe. - if ((State.VF.isVector() || - RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) && - !PhiR->isInLoop()) { + if (State.VF.isVector() && !PhiR->isInLoop()) { ReducedPartRdx = createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi); // If the reduction can be performed in a smaller type, we need to extend @@ -1768,16 +1770,17 @@ void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } -void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { +void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + printOperands(O, SlotTracker); +} - if (!isStore()) { - getVPSingleValue()->printAsOperand(O, SlotTracker); - O << " = "; - } - O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; - +void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN store "; printOperands(O, SlotTracker); } #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 1256e4d8fda50..901ecd10c69d8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -60,14 +60,14 @@ void VPlanTransforms::VPInstructionsToVPRecipes( assert(isa(&Ingredient) && "only VPInstructions expected here"); assert(!isa(Inst) && "phis should be handled above"); - // Create VPWidenMemoryInstructionRecipe for loads and stores. + // Create VPWidenMemoryRecipe for loads and stores. 
if (LoadInst *Load = dyn_cast(Inst)) { - NewRecipe = new VPWidenMemoryInstructionRecipe( + NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast(Inst)) { - NewRecipe = new VPWidenMemoryInstructionRecipe( + NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, Ingredient.getDebugLoc()); @@ -852,6 +852,18 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan, return true; } +static SmallVector collectUsersRecursively(VPValue *V) { + SetVector Users(V->user_begin(), V->user_end()); + for (unsigned I = 0; I != Users.size(); ++I) { + VPRecipeBase *Cur = dyn_cast(Users[I]); + if (!Cur || isa(Cur)) + continue; + for (VPValue *V : Cur->definedValues()) + Users.insert(V->user_begin(), V->user_end()); + } + return Users.takeVector(); +} + void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { for (VPRecipeBase &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -863,24 +875,10 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) { if (RK != RecurKind::Add && RK != RecurKind::Mul) continue; - SmallSetVector Worklist; - Worklist.insert(PhiR); - - for (unsigned I = 0; I != Worklist.size(); ++I) { - VPValue *Cur = Worklist[I]; - if (auto *RecWithFlags = - dyn_cast(Cur->getDefiningRecipe())) { + for (VPUser *U : collectUsersRecursively(PhiR)) + if (auto *RecWithFlags = dyn_cast(U)) { RecWithFlags->dropPoisonGeneratingFlags(); } - - for (VPUser *U : Cur->users()) { - auto *UserRecipe = dyn_cast(U); - if (!UserRecipe) - continue; - for (VPValue *V : UserRecipe->definedValues()) - Worklist.insert(V); - } - } } } @@ -977,10 +975,7 @@ void VPlanTransforms::truncateToMinimalBitwidths( vp_depth_first_deep(Plan.getVectorLoopRegion()))) { for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { if (!isa(&R)) - continue; - if (isa(&R) && - cast(&R)->isStore()) + VPWidenSelectRecipe, VPWidenLoadRecipe>(&R)) continue; VPValue *ResultVPV = R.getVPSingleValue(); @@ -1048,10 +1043,9 @@ void VPlanTransforms::truncateToMinimalBitwidths( assert(cast(&R)->getOpcode() == Instruction::ICmp && "Only ICmps should not need extending the result."); - if (isa(&R)) { - assert(!cast(&R)->isStore() && "stores cannot be narrowed"); + assert(!isa(&R) && "stores cannot be narrowed"); + if (isa(&R)) continue; - } // Shrink operands by introducing truncates as needed. unsigned StartIdx = isa(&R) ? 1 : 0; @@ -1315,7 +1309,7 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { ConstantInt::getTrue(CanonicalIVPHI->getScalarType()->getContext()); VPValue *VPTrueMask = Plan.getOrAddLiveIn(TrueMask); replaceHeaderPredicateWith(Plan, *VPTrueMask, [](VPUser &U, unsigned) { - return isa(U); + return isa(U); }); // Now create the ExplicitVectorLengthPhi recipe in the main loop. auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc()); @@ -1371,8 +1365,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( // instruction. Widen memory instructions involved in address computation // will lead to gather/scatter instructions, which don't need to be // handled. 
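collectUsersRecursively, added above, factors the hand-rolled worklist out of clearReductionWrapFlags. A SetVector serves as both the visited set and the queue, the usual LLVM idiom for a deduplicating breadth-first walk; indexing with a growing I while inserting is safe because SetVector only appends. The same function with the stripped template arguments restored, on the assumption that they match the surrounding code (VPUser, VPRecipeBase, VPHeaderPHIRecipe):

    static SmallVector<VPUser *> collectUsersRecursively(VPValue *V) {
      SetVector<VPUser *> Users(V->user_begin(), V->user_end());
      for (unsigned I = 0; I != Users.size(); ++I) {
        auto *Cur = dyn_cast<VPRecipeBase>(Users[I]);
        if (!Cur || isa<VPHeaderPHIRecipe>(Cur))
          continue; // stop at header phis so cyclic def-use chains terminate
        for (VPValue *Def : Cur->definedValues())
          Users.insert(Def->user_begin(), Def->user_end());
      }
      return Users.takeVector();
    }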
- if (isa(CurRec) || - isa(CurRec) || + if (isa(CurRec) || isa(CurRec) || isa(CurRec) || isa(CurRec)) continue; @@ -1420,7 +1413,7 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( auto Iter = vp_depth_first_deep(Plan.getEntry()); for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly(Iter)) { for (VPRecipeBase &Recipe : *VPBB) { - if (auto *WidenRec = dyn_cast(&Recipe)) { + if (auto *WidenRec = dyn_cast(&Recipe)) { Instruction &UnderlyingInstr = WidenRec->getIngredient(); VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe(); if (AddrDef && WidenRec->isConsecutive() && diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index da3a768552fc5..0bbc7ffb4a2fe 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -36,7 +36,6 @@ class VPDef; class VPSlotTracker; class VPUser; class VPRecipeBase; -class VPWidenMemoryInstructionRecipe; // This is the base class of the VPlan Def/Use graph, used for modeling the data // flow into, within and out of the VPlan. VPValues can stand for live-ins @@ -51,7 +50,6 @@ class VPValue { friend class VPInterleavedAccessInfo; friend class VPSlotTracker; friend class VPRecipeBase; - friend class VPWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). @@ -358,11 +356,12 @@ class VPDef { VPWidenCanonicalIVSC, VPWidenCastSC, VPWidenGEPSC, - VPWidenMemoryInstructionSC, + VPWidenLoadSC, + VPWidenStoreSC, VPWidenSC, VPWidenSelectSC, - // START: Phi-like recipes. Need to be kept together. VPBlendSC, + // START: Phi-like recipes. Need to be kept together. VPWidenPHISC, VPPredInstPHISC, // START: SubclassID for recipes that inherit VPHeaderPHIRecipe. @@ -376,7 +375,7 @@ class VPDef { VPReductionPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes - VPFirstPHISC = VPBlendSC, + VPFirstPHISC = VPWidenPHISC, VPFirstHeaderPHISC = VPCanonicalIVPHISC, VPLastHeaderPHISC = VPReductionPHISC, VPLastPHISC = VPReductionPHISC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 12d37fa711db9..5587302207acd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -128,7 +128,7 @@ static bool verifyVPBasicBlock(const VPBasicBlock *VPBB, } return true; } - if (isa(R)) + if (isa(R)) VPWidenMemRecipe = R; return true; }; diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index e0e2f50c89ada..4918cee1fa82a 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -886,7 +886,7 @@ bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) { SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode( *FunctionalOpcode, &VPI, nullptr, &AC, &DT); if (!SafeToSpeculate && - !isKnownNonZero(EVL, /*Depth=*/0, SimplifyQuery(*DL, &DT, &AC, &VPI))) + !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI))) return false; Value *ScalarVal = diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll index ec95a313e11e2..106f2f9edc2e4 100644 --- a/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll +++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-load.ll @@ -401,3 +401,623 @@ entry: %lane = shufflevector <2 x i64> %tmp1, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %lane } + +define void 
@vld2(ptr %p) { +; CHECK-LABEL: 'vld2' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = load <8 x i8>, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = load <8 x i64>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64 = load <16 x i64>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i64 = load <32 x i64>, ptr %p, align 256 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CODESIZE-LABEL: 'vld2' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <4 x i8>, ptr %p, align 4 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = load <8 x 
i8>, ptr %p, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = load <16 x i8>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = load <32 x i8>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = load <4 x i16>, ptr %p, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = load <8 x i16>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = load <16 x i16>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i16 = load <32 x i16>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = load <4 x i32>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = load <8 x i32>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32_1 = 
shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = load <16 x i32>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v32i32 = load <32 x i32>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i64 = load <4 x i64>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i64 = load <8 x i64>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i64 = load <16 x i64>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i64 = load <32 x i64>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %v4i8 = load <4 x i8>, ptr %p + %v4i8_0 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> + %v4i8_1 = shufflevector <4 x i8> %v4i8, <4 x i8> undef, <2 x i32> + %v8i8 = load <8 x i8>, ptr %p + %v8i8_0 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> + %v8i8_1 = shufflevector <8 x i8> %v8i8, <8 x i8> undef, <4 x i32> + %v16i8 = load <16 x i8>, ptr %p + %v16i8_0 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> + %v16i8_1 = shufflevector <16 x i8> %v16i8, <16 x i8> undef, <8 x i32> + %v32i8 = load <32 x i8>, ptr %p + %v32i8_0 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> + %v32i8_1 = shufflevector <32 x i8> %v32i8, <32 x i8> undef, <16 x i32> + + %v4i16 = load <4 x i16>, ptr %p + %v4i16_0 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> + %v4i16_1 = shufflevector <4 x i16> %v4i16, <4 x i16> undef, <2 x i32> + %v8i16 = load <8 x i16>, ptr 
%p + %v8i16_0 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> + %v8i16_1 = shufflevector <8 x i16> %v8i16, <8 x i16> undef, <4 x i32> + %v16i16 = load <16 x i16>, ptr %p + %v16i16_0 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> + %v16i16_1 = shufflevector <16 x i16> %v16i16, <16 x i16> undef, <8 x i32> + %v32i16 = load <32 x i16>, ptr %p + %v32i16_0 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> + %v32i16_1 = shufflevector <32 x i16> %v32i16, <32 x i16> undef, <16 x i32> + + %v4i32 = load <4 x i32>, ptr %p + %v4i32_0 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> + %v4i32_1 = shufflevector <4 x i32> %v4i32, <4 x i32> undef, <2 x i32> + %v8i32 = load <8 x i32>, ptr %p + %v8i32_0 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> + %v8i32_1 = shufflevector <8 x i32> %v8i32, <8 x i32> undef, <4 x i32> + %v16i32 = load <16 x i32>, ptr %p + %v16i32_0 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> + %v16i32_1 = shufflevector <16 x i32> %v16i32, <16 x i32> undef, <8 x i32> + %v32i32 = load <32 x i32>, ptr %p + %v32i32_0 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> + %v32i32_1 = shufflevector <32 x i32> %v32i32, <32 x i32> undef, <16 x i32> + + %v2i64 = load <4 x i64>, ptr %p + %v2i64_0 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> + %v2i64_1 = shufflevector <4 x i64> %v2i64, <4 x i64> undef, <2 x i32> + %v4i64 = load <8 x i64>, ptr %p + %v4i64_0 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> + %v4i64_1 = shufflevector <8 x i64> %v4i64, <8 x i64> undef, <4 x i32> + %v8i64 = load <16 x i64>, ptr %p + %v8i64_0 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> + %v8i64_1 = shufflevector <16 x i64> %v8i64, <16 x i64> undef, <8 x i32> + %v16i64 = load <32 x i64>, ptr %p + %v16i64_0 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> + %v16i64_1 = shufflevector <32 x i64> %v16i64, <32 x i64> undef, <16 x i32> + + ret void +} + + +define void @vld3(ptr %p) { +; CHECK-LABEL: 'vld3' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <6 x i8>, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <12 x i8>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64 +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256 +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <6 x i64>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <12 x i64>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <24 x i64>, ptr %p, align 256 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <48 x i64>, ptr %p, align 512 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector 
<48 x i64> %v16i64, <48 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CODESIZE-LABEL: 'vld3' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <6 x i8>, ptr %p, align 8 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <12 x i8>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <24 x i8>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <48 x i8>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <6 x i16>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <12 x i16>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for 
instruction: %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <24 x i16>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <48 x i16>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <6 x i32>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <12 x i32>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <24 x i32>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <48 x i32>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %v2i64 = load <6 x i64>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <12 x i64>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <24 x i64>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <48 x i64>, ptr %p, align 512 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %v2i8 = load <6 x i8>, ptr %p + %v2i8_0 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> + %v2i8_1 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> + %v2i8_2 = shufflevector <6 x i8> %v2i8, <6 x i8> undef, <2 x i32> + %v4i8 = load <12 x i8>, ptr %p + %v4i8_0 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> + %v4i8_1 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> + %v4i8_2 = shufflevector <12 x i8> %v4i8, <12 x i8> undef, <4 x i32> + %v8i8 = load <24 x i8>, ptr %p + %v8i8_0 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> + %v8i8_1 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> + %v8i8_2 = shufflevector <24 x i8> %v8i8, <24 x i8> undef, <8 x i32> + %v16i8 = load <48 x i8>, ptr %p + %v16i8_0 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> + %v16i8_1 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> + %v16i8_2 = shufflevector <48 x i8> %v16i8, <48 x i8> undef, <16 x i32> + + %v2i16 = load <6 x i16>, ptr %p + %v2i16_0 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> + %v2i16_1 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> + %v2i16_2 = shufflevector <6 x i16> %v2i16, <6 x i16> undef, <2 x i32> + %v4i16 = load <12 x i16>, ptr %p + 
%v4i16_0 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> + %v4i16_1 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> + %v4i16_2 = shufflevector <12 x i16> %v4i16, <12 x i16> undef, <4 x i32> + %v8i16 = load <24 x i16>, ptr %p + %v8i16_0 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> + %v8i16_1 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> + %v8i16_2 = shufflevector <24 x i16> %v8i16, <24 x i16> undef, <8 x i32> + %v16i16 = load <48 x i16>, ptr %p + %v16i16_0 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> + %v16i16_1 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> + %v16i16_2 = shufflevector <48 x i16> %v16i16, <48 x i16> undef, <16 x i32> + + %v2i32 = load <6 x i32>, ptr %p + %v2i32_0 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> + %v2i32_1 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> + %v2i32_2 = shufflevector <6 x i32> %v2i32, <6 x i32> undef, <2 x i32> + %v4i32 = load <12 x i32>, ptr %p + %v4i32_0 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> + %v4i32_1 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> + %v4i32_2 = shufflevector <12 x i32> %v4i32, <12 x i32> undef, <4 x i32> + %v8i32 = load <24 x i32>, ptr %p + %v8i32_0 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> + %v8i32_1 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> + %v8i32_2 = shufflevector <24 x i32> %v8i32, <24 x i32> undef, <8 x i32> + %v16i32 = load <48 x i32>, ptr %p + %v16i32_0 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> + %v16i32_1 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> + %v16i32_2 = shufflevector <48 x i32> %v16i32, <48 x i32> undef, <16 x i32> + + %v2i64 = load <6 x i64>, ptr %p + %v2i64_0 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> + %v2i64_1 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> + %v2i64_2 = shufflevector <6 x i64> %v2i64, <6 x i64> undef, <2 x i32> + %v4i64 = load <12 x i64>, ptr %p + %v4i64_0 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> + %v4i64_1 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> + %v4i64_2 = shufflevector <12 x i64> %v4i64, <12 x i64> undef, <4 x i32> + %v8i64 = load <24 x i64>, ptr %p + %v8i64_0 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> + %v8i64_1 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> + %v8i64_2 = shufflevector <24 x i64> %v8i64, <24 x i64> undef, <8 x i32> + %v16i64 = load <48 x i64>, ptr %p + %v16i64_0 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> + %v16i64_1 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> + %v16i64_2 = shufflevector <48 x i64> %v16i64, <48 x i64> undef, <16 x i32> + + ret void +} + +define void @vld4(ptr %p) { +; CHECK-LABEL: 'vld4' +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 16 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64 
+; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256 +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64 +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128 +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256 +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512 +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CODESIZE-LABEL: 'vld4' +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8 = load <8 x i8>, ptr %p, align 8 
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = load <16 x i8>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = load <32 x i8>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i8 = load <64 x i8>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = load <8 x i16>, ptr %p, align 16 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = load <16 x i16>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for 
instruction: %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i16 = load <32 x i16>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i16 = load <64 x i16>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32 = load <8 x i32>, ptr %p, align 32 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4i32 = load <16 x i32>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i32 = load <32 x i32>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16i32 = load <64 x i32>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2i64 = load <8 x i64>, ptr %p, align 64 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4i64 = load <16 x i64>, ptr %p, align 128 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v8i64 = load <32 x i64>, ptr %p, align 256 +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32> +; CODESIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v16i64 = load <64 x i64>, ptr %p, align 512 +; CODESIZE-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+; CODESIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  %v2i8 = load <8 x i8>, ptr %p
+  %v2i8_0 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32>
+  %v2i8_1 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32>
+  %v2i8_2 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32>
+  %v2i8_3 = shufflevector <8 x i8> %v2i8, <8 x i8> undef, <2 x i32>
+  %v4i8 = load <16 x i8>, ptr %p
+  %v4i8_0 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32>
+  %v4i8_1 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32>
+  %v4i8_2 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32>
+  %v4i8_3 = shufflevector <16 x i8> %v4i8, <16 x i8> undef, <4 x i32>
+  %v8i8 = load <32 x i8>, ptr %p
+  %v8i8_0 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32>
+  %v8i8_1 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32>
+  %v8i8_2 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32>
+  %v8i8_3 = shufflevector <32 x i8> %v8i8, <32 x i8> undef, <8 x i32>
+  %v16i8 = load <64 x i8>, ptr %p
+  %v16i8_0 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32>
+  %v16i8_1 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32>
+  %v16i8_2 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32>
+  %v16i8_3 = shufflevector <64 x i8> %v16i8, <64 x i8> undef, <16 x i32>
+
+  %v2i16 = load <8 x i16>, ptr %p
+  %v2i16_0 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32>
+  %v2i16_1 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32>
+  %v2i16_2 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32>
+  %v2i16_3 = shufflevector <8 x i16> %v2i16, <8 x i16> undef, <2 x i32>
+  %v4i16 = load <16 x i16>, ptr %p
+  %v4i16_0 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32>
+  %v4i16_1 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32>
+  %v4i16_2 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32>
+  %v4i16_3 = shufflevector <16 x i16> %v4i16, <16 x i16> undef, <4 x i32>
+  %v8i16 = load <32 x i16>, ptr %p
+  %v8i16_0 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32>
+  %v8i16_1 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32>
+  %v8i16_2 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32>
+  %v8i16_3 = shufflevector <32 x i16> %v8i16, <32 x i16> undef, <8 x i32>
+  %v16i16 = load <64 x i16>, ptr %p
+  %v16i16_0 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32>
+  %v16i16_1 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32>
+  %v16i16_2 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32>
+  %v16i16_3 = shufflevector <64 x i16> %v16i16, <64 x i16> undef, <16 x i32>
+
+  %v2i32 = load <8 x i32>, ptr %p
+  %v2i32_0 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32>
+  %v2i32_1 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32>
+  %v2i32_2 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32>
+  %v2i32_3 = shufflevector <8 x i32> %v2i32, <8 x i32> undef, <2 x i32>
+  %v4i32 = load <16 x i32>, ptr %p
+  %v4i32_0 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32>
+  %v4i32_1 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32>
+  %v4i32_2 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32>
+  %v4i32_3 = shufflevector <16 x i32> %v4i32, <16 x i32> undef, <4 x i32>
+  %v8i32 = load <32 x i32>, ptr %p
+  %v8i32_0 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32>
+  %v8i32_1 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32>
+  %v8i32_2 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32>
+  %v8i32_3 = shufflevector <32 x i32> %v8i32, <32 x i32> undef, <8 x i32>
+  %v16i32 = load <64 x i32>, ptr %p
+  %v16i32_0 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32>
+  %v16i32_1 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32>
+  %v16i32_2 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32>
+  %v16i32_3 = shufflevector <64 x i32> %v16i32, <64 x i32> undef, <16 x i32>
+
+  %v2i64 = load <8 x i64>, ptr %p
+  %v2i64_0 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32>
+  %v2i64_1 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32>
+  %v2i64_2 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32>
+  %v2i64_3 = shufflevector <8 x i64> %v2i64, <8 x i64> undef, <2 x i32>
+  %v4i64 = load <16 x i64>, ptr %p
+  %v4i64_0 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32>
+  %v4i64_1 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32>
+  %v4i64_2 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32>
+  %v4i64_3 = shufflevector <16 x i64> %v4i64, <16 x i64> undef, <4 x i32>
+  %v8i64 = load <32 x i64>, ptr %p
+  %v8i64_0 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32>
+  %v8i64_1 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32>
+  %v8i64_2 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32>
+  %v8i64_3 = shufflevector <32 x i64> %v8i64, <32 x i64> undef, <8 x i32>
+  %v16i64 = load <64 x i64>, ptr %p
+  %v16i64_0 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+  %v16i64_1 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+  %v16i64_2 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+  %v16i64_3 = shufflevector <64 x i64> %v16i64, <64 x i64> undef, <16 x i32>
+
+  ret void
+}
+
diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
index 27d24faf0a8da..caa6d6f483a24 100644
--- a/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/rvv-cmp.ll
@@ -9,38 +9,38 @@ define void @icmp_eq() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp eq <4 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp eq <8 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp eq <16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp eq <32 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp eq <32 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp eq <vscale x 1 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp eq <vscale x 2 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp eq <vscale x 4 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp eq <vscale x 8 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp eq <vscale x 16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp eq <vscale x 32 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp eq <vscale x 16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp eq <vscale x 32 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp eq <2 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp eq <4 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp eq <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp eq <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp eq <16 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp eq <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp eq <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp eq <vscale x 4 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp eq <vscale x 8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp eq <vscale x 16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp eq <vscale x 8 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp eq <vscale x 16 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp eq <2 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp eq <4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp eq <8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp eq <16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp eq <8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp eq <16 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp eq <vscale x 1 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp eq <vscale x 2 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp eq <vscale x 4 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp eq <vscale x 8 x i32> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp eq <vscale x 16 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp eq <vscale x 4 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp eq <vscale x 8 x i32> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp eq <vscale x 16 x i32> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp eq <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp eq <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp eq <8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp eq <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp eq <8 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp eq <vscale x 1 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp eq <vscale x 2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp eq <vscale x 4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp eq <vscale x 8 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp eq <vscale x 2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp eq <vscale x 4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp eq <vscale x 8 x i64> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8 = icmp eq <2 x i8> undef, undef
@@ -96,38 +96,38 @@ define void @icmp_ne() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ne <4 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ne <8 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ne <16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ne <32 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ne <32 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ne <vscale x 1 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ne <vscale x 2 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ne <vscale x 4 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ne <vscale x 8 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ne <vscale x 16 x i8> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ne <vscale x 32 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ne <vscale x 16 x i8> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ne <vscale x 32 x i8> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ne <2 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ne <4 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ne <8 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ne <16 x i16> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ne <16 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ne <vscale x 1 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ne <vscale x 2 x i16> undef, undef
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ne <vscale x 4 x i16> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost
of 1 for instruction: %nxv8i16 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ne <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ne <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ne <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ne <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ne <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ne <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ne <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ne <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ne <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ne <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ne <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ne undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ne undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ne undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ne <2 x i8> undef, undef @@ -183,38 +183,38 @@ define void @icmp_ugt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ugt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp 
ugt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ugt <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ugt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ugt <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ugt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ugt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ugt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ugt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ugt <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ugt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ugt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ugt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ugt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ugt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ugt <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %nxv8i32 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ugt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ugt <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ugt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ugt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ugt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ugt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ugt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ugt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ugt <2 x i8> undef, undef @@ -270,38 +270,38 @@ define void @icmp_uge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp uge <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp uge <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp uge <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp uge <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp uge <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp uge <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v4i16 = icmp uge <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp uge <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp uge <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp uge <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp uge <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp uge <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp uge <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp uge <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp uge <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp uge <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp uge <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp uge <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp uge <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp uge <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp uge <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp uge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp uge undef, 
undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp uge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp uge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp uge <2 x i8> undef, undef @@ -357,38 +357,38 @@ define void @icmp_ult() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ult <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ult <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ult <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ult <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ult <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ult <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ult <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ult <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ult <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ult <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ult <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ult <4 x i32> undef, undef -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ult <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ult <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ult <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ult <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp ult <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ult <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ult <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ult <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ult <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ult undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ult undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ult undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ult <2 x i8> undef, undef @@ -444,38 +444,38 @@ define void @icmp_ule() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp ule <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp ule <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp ule <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp ule <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp ule <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp ule undef, 
undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp ule <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp ule <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp ule <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp ule <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp ule <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp ule <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp ule <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp ule <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp ule <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp ule <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp ule <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %v2i64 = icmp ule <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp ule <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp ule <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp ule <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp ule <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp ule undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp ule undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp ule undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp ule <2 x i8> undef, undef @@ -531,38 +531,38 @@ define void @icmp_sgt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp sgt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp sgt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp sgt <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp sgt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp sgt <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp sgt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp sgt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp sgt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp sgt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp sgt <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %nxv2i16 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp sgt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp sgt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp sgt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp sgt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp sgt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp sgt <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp sgt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp sgt <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp sgt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp sgt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp sgt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp sgt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp sgt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp sgt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp sgt <2 x i8> undef, undef @@ 
-618,38 +618,38 @@ define void @icmp_sge() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp sge <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp sge <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp sge <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp sge <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp sge <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp sge <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp sge <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp sge <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp sge <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp sge <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp sge <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp sge <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp sge <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp sge <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp sge <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp sge <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp sge undef, undef ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp sge <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp sge <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp sge <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp sge <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp sge <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp sge undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp sge undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp sge undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp sge <2 x i8> undef, undef @@ -705,38 +705,38 @@ define void @icmp_slt() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp slt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp slt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp slt <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %nxv32i8 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp slt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp slt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp slt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp slt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp slt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp slt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %nxv1i64 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp slt undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp slt undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp slt undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp slt <2 x i8> undef, undef @@ -792,38 +792,38 @@ define void @icmp_sle() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8 = icmp sle <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8 = icmp sle <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8 = icmp sle <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v32i8 = icmp sle <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v32i8 = icmp sle <32 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i8 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i8 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv32i8 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv16i8 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv32i8 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i16 = icmp sle <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i16 = icmp sle <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i16 = icmp sle <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i16 = icmp sle <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i16 = icmp sle <16 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i16 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i16 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i16 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv8i16 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv16i16 = icmp sle 
undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = icmp sle <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = icmp sle <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i32 = icmp sle <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i32 = icmp sle <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i32 = icmp sle <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i32 = icmp sle <16 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i32 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i32 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv16i32 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = icmp sle <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i64 = icmp sle <4 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i64 = icmp sle <8 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = icmp sle <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i64 = icmp sle <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv4i64 = icmp sle undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %nxv8i64 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64 = icmp sle undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64 = icmp sle undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2i8 = icmp sle <2 x i8> undef, undef diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll index 90e49e6ccb81c..e02ba761aa8d2 100644 --- a/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/bitreverse-codesize.ll @@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = 
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
@@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i32'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i32'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
@@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i16'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i16'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i8'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i8'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
@@ -270,7 +270,7 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64'
@@ -323,7 +323,7 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v8i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64'
@@ -331,7 +331,7 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64'
@@ -421,7 +421,7 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v8i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32'
@@ -474,7 +474,7 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v16i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32'
@@ -482,7 +482,7 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32'
@@ -572,7 +572,7 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v16i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16'
@@ -625,7 +625,7 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v32i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16'
@@ -633,7 +633,7 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16'
@@ -723,7 +723,7 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v32i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8'
@@ -776,7 +776,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v64i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8'
@@ -784,7 +784,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8'
diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll
index e7d0d05f82429..ba231b985c26b 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse-latency.ll
@@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
@@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i32'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i32'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
@@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i16'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i16'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i8'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i8'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
@@ -221,23 +221,23 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v2i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
@@ -270,23 +270,23 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v4i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
@@ -323,23 +323,23 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v8i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v8i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse
 ;
 %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
@@ -376,23 +376,23 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v4i32'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v4i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v4i32'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse
 ;
 %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
@@ -425,23 +425,23 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v8i32'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v8i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse
 ;
 %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
@@ -478,23 +478,23 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v16i32'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v16i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse
 ;
 %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
@@ -531,23 +531,23 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v8i16'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v8i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v8i16'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse
 ;
 %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
@@ -580,23 +580,23 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v16i16'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v16i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse
 ;
 %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
@@ -633,23 +633,23 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v32i16'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v32i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse
 ;
 %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
@@ -686,23 +686,23 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v16i8'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v16i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v16i8'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i8'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse
 ;
 %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
@@ -735,23 +735,23 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v32i8'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v32i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i8'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse
 ;
 %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
@@ -788,23 +788,23 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v64i8'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v64i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse
 ;
 %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll
index 1ba93af46ebdc..d60fac228fc06 100644
--- a/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/bitreverse-sizelatency.ll
@@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %bitreverse
 ;
 %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
@@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i32'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i32'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i32'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i32'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %bitreverse
 ;
 %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
@@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i16'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i16'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i16'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i16'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %bitreverse
 ;
 %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
@@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_i8'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_i8'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_i8'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_i8'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %bitreverse
 ;
 %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
@@ -217,23 +217,23 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v2i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 ; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64'
-; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
+; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
 ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %bitreverse
 ;
 %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
@@ -270,23 +270,23 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNISSE-LABEL: 'var_bitreverse_v4i64'
-; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
 ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse
 ;
 ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64'
-; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
+; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64>
@llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i64> %bitreverse ; %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) @@ -323,23 +323,23 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %bitreverse ; %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) @@ -372,23 +372,23 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; 
GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %bitreverse ; %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) @@ -425,23 +425,23 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; 
GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i32> %bitreverse ; %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) @@ -478,23 +478,23 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i32> %bitreverse ; %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) @@ -527,23 +527,23 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x 
i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %bitreverse ; %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) @@ -580,23 +580,23 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 
'var_bitreverse_v16i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %bitreverse ; %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) @@ -633,23 +633,23 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %bitreverse ; %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) @@ -682,23 +682,23 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x 
i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %bitreverse ; %bitreverse = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a) @@ -735,23 +735,23 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v32i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> 
@llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %bitreverse ; %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) @@ -788,23 +788,23 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v64i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %bitreverse ; %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) diff --git a/llvm/test/Analysis/CostModel/X86/bitreverse.ll 
b/llvm/test/Analysis/CostModel/X86/bitreverse.ll index 0b76b0d527ba1..a890147fee465 100644 --- a/llvm/test/Analysis/CostModel/X86/bitreverse.ll +++ b/llvm/test/Analysis/CostModel/X86/bitreverse.ll @@ -40,23 +40,23 @@ define i64 @var_bitreverse_i64(i64 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %bitreverse ; %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a) @@ -77,23 +77,23 @@ define i32 @var_bitreverse_i32(i32 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %bitreverse ; %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a) @@ -114,23 +114,23 @@ define i16 @var_bitreverse_i16(i16 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %bitreverse ; %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a) @@ -151,23 +151,23 @@ define i8 @var_bitreverse_i8(i8 %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_i8' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNISSE-NEXT: Cost Model: 
Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_i8' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_i8' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %bitreverse ; %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a) @@ -217,23 +217,23 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v2i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> 
@llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v2i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %bitreverse ; %bitreverse = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a) @@ -270,23 +270,23 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i64> %bitreverse ; %bitreverse = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a) @@ -323,23 +323,23 @@ define <8 x i64> @var_bitreverse_v8i64(<8 x i64> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i64' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; 
GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i64' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %bitreverse ; %bitreverse = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a) @@ -372,23 +372,23 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v4i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; 
GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v4i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %bitreverse ; %bitreverse = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a) @@ -425,23 +425,23 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i32> %bitreverse ; %bitreverse = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a) @@ -478,23 +478,23 @@ define <16 x i32> @var_bitreverse_v16i32(<16 x i32> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i32' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNISSE-NEXT: Cost Model: 
Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i32' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i32> %bitreverse ; %bitreverse = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a) @@ -527,23 +527,23 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v8i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v8i16' -; 
GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v8i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %bitreverse ; %bitreverse = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a) @@ -580,23 +580,23 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNISSE-LABEL: 'var_bitreverse_v16i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v16i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %bitreverse ; %bitreverse = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a) @@ -633,23 +633,23 @@ define <32 x i16> @var_bitreverse_v32i16(<32 x i16> %a) { ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNISSE-LABEL: 
'var_bitreverse_v32i16' -; GFNISSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNISSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v32i16' -; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) +; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) ; GFNIAVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %bitreverse ; %bitreverse = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a) @@ -743,7 +743,7 @@ define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v32i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v32i8' @@ -796,7 +796,7 @@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; GFNISSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX2-LABEL: 'var_bitreverse_v64i8' @@ -804,7 +804,7 
@@ define <64 x i8> @var_bitreverse_v64i8(<64 x i8> %a) { ; GFNIAVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512F-LABEL: 'var_bitreverse_v64i8' -; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) +; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bitreverse = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a) ; GFNIAVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %bitreverse ; ; GFNIAVX512BW-LABEL: 'var_bitreverse_v64i8' diff --git a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll index c212ecfc3eb90..a7585a4d9f39e 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-codesize.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46
for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call <32 x i8> 
@llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 
x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: 
Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 
x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: 
Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1690,6 +1811,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1775,6 +1903,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> 
%a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -2023,6 +2158,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2123,6 +2267,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for 
instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2335,6 +2488,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2420,6 +2580,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2577,9 +2744,9 @@ define 
void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2616,6 +2783,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2655,9 +2829,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> 
%a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2669,9 +2843,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2694,6 +2868,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll 
b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll index 487adfe79d944..7105f713fdc34 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-latency.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +;
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x 
i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> 
@llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost 
Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -939,6 +1011,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1024,6 +1103,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, 
<8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1109,6 +1195,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1198,6 +1291,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1276,6 +1376,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 
20 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1354,6 +1461,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1432,6 +1546,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1642,6 +1763,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1727,6 +1855,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1975,6 +2110,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; 
XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2075,6 +2219,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2287,6 +2440,13 @@ 
define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2372,6 +2532,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2529,9 +2696,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> 
%a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2568,6 +2735,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2607,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2621,9 +2795,9 @@ define void @splatconstant_rotate_i8(i8 
%a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2646,6 +2820,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll index 52ad7e13c84c6..5d7361e293176 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl-sizelatency.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s
-passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found
an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = 
shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, 
<2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost 
Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1562,6 +1683,13 @@ define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated 
cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128) @@ -1647,6 +1775,13 @@ define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128) @@ -1732,6 +1867,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> 
@llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1817,6 +1959,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1919,6 +2068,15 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 
x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -2019,6 +2177,15 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -2119,6 +2286,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2219,6 +2395,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2303,6 +2488,13 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) 
%V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) @@ -2388,6 +2580,13 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2473,6 +2672,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2558,6 +2764,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2640,6 +2853,13 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %a64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) @@ -2718,6 +2938,13 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %a32, i32 5) %V2I32 = call <4 
x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2757,9 +2984,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2796,6 +3023,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2835,9 +3069,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2849,9 +3083,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2874,6 +3108,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, 
<16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshl.ll b/llvm/test/Analysis/CostModel/X86/fshl.ll index 4e688c29c7ea3..1cbdab09acd90 100644 --- a/llvm/test/Analysis/CostModel/X86/fshl.ll +++ b/llvm/test/Analysis/CostModel/X86/fshl.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an
estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: 
Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 
x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for 
instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -932,6 +1004,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1017,6 +1096,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1102,6 +1188,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1191,6 +1284,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1269,6 +1369,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x 
i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshl.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1347,6 +1454,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1425,6 +1539,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an 
estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1635,6 +1756,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1720,6 +1848,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1968,6 +2103,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 
for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2068,6 +2212,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2273,6 +2426,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, 
<16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2358,6 +2518,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2515,8 +2682,8 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -2554,6 +2721,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshl.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2593,9 +2767,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2607,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for 
instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2632,6 +2806,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshl.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll index 0e76246fad85f..ecc861dd7f8ee 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-codesize.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> 
@llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> 
) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an 
estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1690,6 +1811,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated 
cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1775,6 +1903,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -2023,6 +2158,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = 
shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -2123,6 +2267,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2335,6 +2488,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2420,6 +2580,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2477,9 +2644,9 @@ define void @splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256 ; ; XOP-LABEL: 'splatconstant_rotate_i64' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) @@ -2534,9 +2701,9 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256 ; ; XOP-LABEL: 'splatconstant_rotate_i32' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, 
i32 5) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) @@ -2577,9 +2744,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2612,10 +2779,17 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; XOP-LABEL: 'splatconstant_rotate_i16' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> 
@llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2655,9 +2829,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2669,9 +2843,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, 
<16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SLM-LABEL: 'splatconstant_rotate_i8' @@ -2690,10 +2864,17 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-LABEL: 'splatconstant_rotate_i8' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) diff --git a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll index 73e32dc92a69b..0142ad77849ca 100644 --- a/llvm/test/Analysis/CostModel/X86/fshr-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/fshr-latency.ll @@ -12,6 +12,7 @@ ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | 
FileCheck %s --check-prefixes=AVX512,AVX512GFNI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 
x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
 %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -939,6 +1011,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
 %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1024,6 +1103,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1109,6 +1195,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1198,6 +1291,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -1276,6 +1376,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
 %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1354,6 +1461,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1432,6 +1546,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1642,6 +1763,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
@@ -1727,6 +1855,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
@@ -1975,6 +2110,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -2075,6 +2219,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -2287,6 +2440,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2372,6 +2532,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
@@ -2529,9 +2696,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ;
 ; AVX512BW-LABEL: 'splatconstant_rotate_i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'splatconstant_rotate_i16'
@@ -2568,6 +2735,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2607,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512BW-LABEL: 'splatconstant_rotate_i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'splatconstant_rotate_i8'
@@ -2621,9 +2795,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8'
 ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
-; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
-; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SLM-LABEL: 'splatconstant_rotate_i8'
@@ -2646,6 +2820,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
diff --git a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
index 2d106c6dd3069..6dafb20a0aeed 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr-sizelatency.ll
@@ -12,6 +12,7 @@
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP
 ; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512GFNI
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
@@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
@@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32)
 %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128)
@@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
 ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128)
@@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %
 ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128)
@@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -572,6 +610,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
 %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -681,6 +728,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -790,6 +846,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -881,6 +946,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -966,6 +1038,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7)
 %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1051,6 +1130,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1136,6 +1222,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1225,6 +1318,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> )
@@ -1310,6 +1410,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
 ; XOP-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
 %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
@@ -1395,6 +1502,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> )
@@ -1480,6 +1594,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1562,6 +1683,13 @@ define void @var_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 %c64)
 %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %c128)
@@ -1647,6 +1775,13 @@ define void @var_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 %c32)
 %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %c128)
@@ -1732,6 +1867,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128)
@@ -1817,6 +1959,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'var_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128)
@@ -1919,6 +2068,15 @@ define void @splatvar_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -2019,6 +2177,15 @@ define void @splatvar_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer
 %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -2119,6 +2286,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -2219,6 +2395,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8>
%a512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -2303,6 +2488,13 @@ define void @constant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> ) @@ -2388,6 +2580,13 @@ define void @constant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2473,6 +2672,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2558,6 +2764,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) @@ -2636,10 +2849,17 @@ define void 
@splatconstant_rotate_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
;
; XOP-LABEL: 'splatconstant_rotate_i64'
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
-; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
-; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
-; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %a256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %a512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %a64, i64 7)
%V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %a128, <2 x i64> <i64 7, i64 7>)
@@ -2714,10 +2934,17 @@ define void @splatconstant_rotate_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
;
; XOP-LABEL: 'splatconstant_rotate_i32'
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5)
-; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> )
-; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> )
-; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> )
+; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> )
+; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> )
+; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> )
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i32'
+;
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %a256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %a512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %a32, i32 5) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %a128, <4 x i32> ) @@ -2757,9 +2984,9 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; AVX512BW-LABEL: 'splatconstant_rotate_i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i16' @@ -2792,10 +3019,17 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; ; XOP-LABEL: 'splatconstant_rotate_i16' ; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) -; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) -; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) ; XOP-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> ) @@ -2835,9 +3069,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512BW-LABEL: 'splatconstant_rotate_i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; AVX512DQ-LABEL: 'splatconstant_rotate_i8' @@ -2849,9 +3083,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8' ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) -; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> ) +; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 4 for 
instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
; SLM-LABEL: 'splatconstant_rotate_i8'
@@ -2870,10 +3104,17 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
;
; XOP-LABEL: 'splatconstant_rotate_i8'
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; XOP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
-; XOP-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
-; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; XOP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; XOP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
; XOP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
%V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
diff --git a/llvm/test/Analysis/CostModel/X86/fshr.ll b/llvm/test/Analysis/CostModel/X86/fshr.ll
index 9565630677d52..ada1b9c5bdc4b 100644
--- a/llvm/test/Analysis/CostModel/X86/fshr.ll
+++ b/llvm/test/Analysis/CostModel/X86/fshr.ll
@@ -12,6 +12,7 @@
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=goldmont | FileCheck %s --check-prefixes=GLM
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=bdver2 | FileCheck %s --check-prefixes=XOP
; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=x86_64-apple-macosx10.8.0 -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
@@ -97,6 +98,13 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128) @@ -182,6 +190,13 @@ define void @var_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i3 ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 %c32) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %c128) @@ -267,6 +282,13 @@ define void @var_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for 
instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %c128) @@ -352,6 +374,13 @@ define void @var_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %c128) @@ -463,6 +492,15 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer @@ -563,6 +601,15 @@ define void @splatvar_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <16 x i32> %c512, <16 x i32> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <4 x i32> %c128, <4 x i32> undef, <4 x i32> zeroinitializer %u256 = shufflevector <8 x i32> %c256, <8 x i32> undef, <8 x i32> zeroinitializer @@ -663,6 +710,15 @@ define void @splatvar_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 
x i16> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer @@ -763,6 +819,15 @@ define void @splatvar_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> %u256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> %u512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer @@ -854,6 +919,13 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i64' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 
%b64, i64 7) %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> ) @@ -932,6 +1004,13 @@ define void @constant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i32' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 7) %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1017,6 +1096,13 @@ define void @constant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 7) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1102,6 +1188,13 @@ define void @constant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'constant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 7)
%V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> )
@@ -1191,6 +1284,13 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i64'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
%V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
@@ -1269,6 +1369,13 @@ define void @splatconstant_funnel_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256
; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_funnel_i32'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> %a256, <8 x i32> %b256, <8 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %a512, <16 x i32> %b512, <16 x i32> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
;
%I32 = call i32 @llvm.fshr.i32(i32 %a32, i32 %b32, i32 5)
%V2I32 =
call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a128, <4 x i32> %b128, <4 x i32> ) @@ -1347,6 +1454,13 @@ define void @splatconstant_funnel_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25 ; XOP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %b256, <16 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %b512, <32 x i16> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %b16, i16 3) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %b128, <8 x i16> ) @@ -1425,6 +1539,13 @@ define void @splatconstant_funnel_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, < ; XOP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatconstant_funnel_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %b256, <32 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %b512, <64 x i8> ) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %b8, i8 3) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %b128, <16 x i8> ) @@ -1635,6 +1756,13 @@ define void @var_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: 
%I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 %c16) %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %c128) @@ -1720,6 +1848,13 @@ define void @var_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> % ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'var_rotate_i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %c256) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %c512) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 %c8) %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %c128) @@ -1968,6 +2103,15 @@ define void @splatvar_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256) ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512) ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; AVX512GFNI-LABEL: 'splatvar_rotate_i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <32 x i16> %c512, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> %u128) +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = 
call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %u128 = shufflevector <8 x i16> %c128, <8 x i16> undef, <8 x i32> zeroinitializer
 %u256 = shufflevector <16 x i16> %c256, <16 x i16> undef, <16 x i32> zeroinitializer
@@ -2068,6 +2212,15 @@ define void @splatvar_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
 ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatvar_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <64 x i8> %c512, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> %u128)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> %u256)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> %u512)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %u128 = shufflevector <16 x i8> %c128, <16 x i8> undef, <16 x i32> zeroinitializer
 %u256 = shufflevector <32 x i8> %c256, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -2273,6 +2426,13 @@ define void @constant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 7)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2358,6 +2518,13 @@ define void @constant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x
 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'constant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 7)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
@@ -2515,8 +2682,8 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ;
 ; AVX512BW-LABEL: 'splatconstant_rotate_i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
@@ -2554,6 +2721,13 @@ define void @splatconstant_rotate_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a25
 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.fshr.v16i16(<16 x i16> %a256, <16 x i16> %a256, <16 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %a512, <32 x i16> %a512, <32 x i16> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %I16 = call i16 @llvm.fshr.i16(i16 %a16, i16 %a16, i16 3)
 %V8I16 = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a128, <8 x i16> %a128, <8 x i16> )
@@ -2593,9 +2767,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512BW-LABEL: 'splatconstant_rotate_i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
-; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'splatconstant_rotate_i8'
@@ -2607,9 +2781,9 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ;
 ; AVX512VBMI2-LABEL: 'splatconstant_rotate_i8'
 ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
-; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
-; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
-; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; AVX512VBMI2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SLM-LABEL: 'splatconstant_rotate_i8'
@@ -2632,6 +2806,13 @@ define void @splatconstant_rotate_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <
 ; XOP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
 ; XOP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; AVX512GFNI-LABEL: 'splatconstant_rotate_i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> %a256, <32 x i8> %a256, <32 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V64I8 = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %a512, <64 x i8> %a512, <64 x i8> )
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 %I8 = call i8 @llvm.fshr.i8(i8 %a8, i8 %a8, i8 3)
 %V16I8 = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a128, <16 x i8> %a128, <16 x i8> )
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll
index 8e3f38c796e6e..a3c24bdd1a882 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-codesize.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI

 ; Verify the cost of vector arithmetic shift right instructions.
@@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -265,6 +270,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -298,6 +307,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -327,6 +340,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -389,6 +406,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -778,6 +799,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -827,6 +854,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -882,6 +915,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -937,6 +976,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
@@ -1158,6 +1203,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a,
 ret <8 x i16> %shift
@@ -1191,6 +1240,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1224,6 +1277,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1253,6 +1310,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1315,6 +1376,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1531,6 +1596,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1568,6 +1637,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1601,6 +1674,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1634,6 +1711,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v32i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1667,6 +1748,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a,
 ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll
index d4ece3b2a1134..67679f2cb8566 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost-inseltpoison.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI

 ; Verify the cost of vector arithmetic shift right instructions.
@@ -228,6 +229,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v8i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -269,6 +274,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -310,6 +319,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -392,6 +409,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a, %b
 ret <32 x i8> %shift
@@ -433,6 +454,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -834,6 +859,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> poison, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
@@ -950,6 +981,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> poison, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %insert = insertelement <32 x i8> poison, i8 %b, i32 0
 %splat = shufflevector <32 x i8> %insert, <32 x i8> poison, <32 x i32> zeroinitializer
@@ -1017,6 +1054,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> poison, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
@@ -1226,6 +1269,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v8i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a,
 ret <8 x i16> %shift
@@ -1267,6 +1314,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1308,6 +1359,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1349,6 +1404,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1390,6 +1449,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1431,6 +1494,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1631,6 +1698,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1672,6 +1743,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1713,6 +1788,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1754,6 +1833,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1795,6 +1878,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a,
 ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index f8fb2d76a778f..efd378ee0b5a6 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI

 ; Verify the cost of vector arithmetic shift right instructions.
@@ -228,6 +229,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v8i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -269,6 +274,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -310,6 +319,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -392,6 +409,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a, %b
 ret <32 x i8> %shift
@@ -433,6 +454,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BWVL-LABEL: 'var_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -834,6 +859,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = ashr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -950,6 +981,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %insert = insertelement <32 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer
@@ -1017,6 +1054,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
@@ -1226,6 +1269,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v8i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a,
 ret <8 x i16> %shift
@@ -1267,6 +1314,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1308,6 +1359,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1349,6 +1404,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1390,6 +1449,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1431,6 +1494,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1631,6 +1698,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1672,6 +1743,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1713,6 +1788,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1754,6 +1833,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1795,6 +1878,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a,
 ret <64 x i8> %shift
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
index af5278e355cfb..cd4189d4a7f84 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-latency.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI

 ; Verify the cost of vector arithmetic shift right instructions.
@@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = ashr <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -265,6 +270,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = ashr <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -298,6 +307,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = ashr <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -331,6 +344,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = ashr <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -364,6 +381,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 %shift = ashr <32 x i8> %a, %b
 ret <32 x i8> %shift
@@ -397,6 +418,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = ashr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -834,6 +859,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i16> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = ashr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -889,6 +920,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
%insert = insertelement <16 x i8> undef, i8 %b, i32 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -944,6 +981,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <32 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -999,6 +1042,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1220,6 +1269,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1253,6 +1306,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1286,6 +1343,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1319,6 +1380,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1352,6 +1417,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1385,6 +1454,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1649,6 +1722,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1686,6 +1763,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost 
Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1723,6 +1804,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1760,6 +1845,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1797,6 +1886,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll index 8286b3967af60..84ccad0294155 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-ashr-sizelatency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions.
@@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, %b ret <8 x i16> %shift @@ -265,6 +270,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift @@ -298,6 +307,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift @@ -331,6 +344,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, %b ret <16 x i8> %shift @@ -364,6 +381,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift @@ -397,6 +418,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost 
Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift @@ -786,6 +811,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = ashr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = ashr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -835,6 +866,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %shift = ashr <16 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %insert = insertelement <16 x i8> undef, i8 %b, i32 0 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -890,6 +927,12 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = ashr <32 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> 
%shift ; %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer @@ -945,6 +988,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = ashr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1166,6 +1215,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = ashr <8 x i16> %a, ret <8 x i16> %shift @@ -1199,6 +1252,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1232,6 +1289,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1265,6 +1326,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1298,6 +1363,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1331,6 +1400,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift @@ -1547,6 +1620,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -1584,6 +1661,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = ashr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -1617,6 +1698,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = ashr <16 x i8> %a, ret <16 x i8> %shift @@ -1654,6 +1739,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 
'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = ashr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -1691,6 +1780,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = ashr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll index 7c506af62e817..a0e15bb8ff738 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-codesize.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions.
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -306,6 +315,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -335,6 +348,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -397,6 +414,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -762,6 +783,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -909,6 +936,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1138,6 +1171,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1171,6 +1208,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1204,6 +1245,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; 
AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1233,6 +1278,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1295,6 +1344,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1495,6 +1548,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1532,6 +1589,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1631,6 +1692,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll 
b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll index 855d2b8e88e3a..34ea1a644f6cc 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost-inseltpoison.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. @@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -355,6 +368,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -396,6 +413,10 @@ define <32 x i8> @var_shift_v32i8(<32 x
i8> %a, <32 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift @@ -437,6 +458,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -826,6 +851,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> poison, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> poison, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer @@ -991,6 +1022,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> poison, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> poison, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer @@ -1200,6 
+1237,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, ret <8 x i16> %shift @@ -1241,6 +1282,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1282,6 +1327,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1323,6 +1372,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, ret <16 x i8> %shift @@ -1364,6 +1417,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1405,6 +1462,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'constant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found 
an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift @@ -1593,6 +1654,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -1634,6 +1699,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1700,6 +1769,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1741,6 +1814,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 55894dc932957..93b844eddf182 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +;
RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. @@ -232,6 +233,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v8i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = lshr <8 x i16> %a, %b ret <8 x i16> %shift @@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift @@ -355,6 +368,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v16i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = lshr <16 x i8> %a, %b ret <16 x i8> %shift @@ -396,6 +413,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift @@ -437,6 +458,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; AVX512BWVL-LABEL: 'var_shift_v64i8' ; AVX512BWVL-NEXT: Cost
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -826,6 +851,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = lshr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -991,6 +1022,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
@@ -1200,6 +1237,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v8i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 %shift = lshr <8 x i16> %a,
 ret <8 x i16> %shift
@@ -1241,6 +1282,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1282,6 +1327,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1323,6 +1372,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v16i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = lshr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1364,6 +1417,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = lshr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1405,6 +1462,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BWVL-LABEL: 'constant_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1593,6 +1654,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1634,6 +1699,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1700,6 +1769,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = lshr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1741,6 +1814,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a,
 ret <64 x i8> %shift
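The blocks above are machine-generated FileCheck assertions rather than hand-written prose. As a hedged illustration of where such lines come from, the sketch below is a minimal standalone test in the same style (the function name and CHECK pattern are mine, not part of the patch); feeding it to the same opt invocation as the RUN lines above prints one "Cost Model: ..." line per instruction for FileCheck to match.

; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 \
; RUN:     -disable-output -mcpu=tigerlake | FileCheck %s
define <16 x i8> @lshr_example(<16 x i8> %a, <16 x i8> %b) {
; CHECK: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: %shift = lshr <16 x i8> %a, %b
  %shift = lshr <16 x i8> %a, %b
  ret <16 x i8> %shift
}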
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll
index 92fe253d38e73..61620e2cc97ed 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-latency.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI

 ; Verify the cost of vector logical shift right instructions.

@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = lshr <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -277,6 +282,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = lshr <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -421,6 +438,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -834,6 +855,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i16> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = lshr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -889,6 +916,12 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <16 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <16 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <16 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %insert = insertelement <16 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <16 x i8> %insert, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -993,6 +1026,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
@@ -1222,6 +1261,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = lshr <8 x i16> %a,
 ret <8 x i16> %shift
@@ -1259,6 +1302,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1296,6 +1343,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1333,6 +1384,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = lshr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1403,6 +1458,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %shift = lshr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1643,6 +1702,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1676,6 +1739,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1709,6 +1776,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <16 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = lshr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = lshr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1771,6 +1842,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = lshr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a,
 ret <64 x i8> %shift
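The parallel vshift-lshr-* files cost the same IR under different cost kinds: throughput is the default when -cost-kind is not given, and the *-codesize, *-latency, and *-sizelatency variants pass -cost-kind=code-size, latency, and size-latency respectively. That is why the v64i8 lshr above is 7 under throughput but 27 under latency on the AVX512BW targets. A hedged sketch of exercising two kinds on one file (function name and prefixes are mine):

; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 \
; RUN:     -disable-output -mcpu=tigerlake | FileCheck %s --check-prefix=TPUT
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 \
; RUN:     -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefix=LAT
define <64 x i8> @var_shift(<64 x i8> %a, <64 x i8> %b) {
; TPUT: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: %shift = lshr <64 x i8> %a, %b
; LAT: Cost Model: Found an estimated cost of {{[0-9]+}} for instruction: %shift = lshr <64 x i8> %a, %b
  %shift = lshr <64 x i8> %a, %b
  ret <64 x i8> %shift
}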
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll
index fe8a7d91cc375..e6b6ac75b65d5 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-lshr-sizelatency.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=size-latency -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI

 ; Verify the cost of vector logical shift right instructions.

@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = lshr <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -306,6 +315,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -335,6 +348,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = lshr <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -368,6 +385,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 %shift = lshr <32 x i8> %a, %b
 ret <32 x i8> %shift
@@ -401,6 +422,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -766,6 +791,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i16> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -913,6 +944,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <64 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %shift = lshr <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
@@ -1142,6 +1179,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <8 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = lshr <8 x i16> %a,
 ret <8 x i16> %shift
@@ -1175,6 +1216,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1208,6 +1253,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = lshr <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1237,6 +1286,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = lshr <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = lshr <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1270,6 +1323,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v32i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift
 ;
 %shift = lshr <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1303,6 +1360,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %shift = lshr <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = lshr <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1503,6 +1564,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = lshr <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1540,6 +1605,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a,
Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = lshr <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -1606,6 +1675,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -1643,6 +1716,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = lshr <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll index 07229e22873c4..265658b1e3a2d 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector logical shift right instructions. 
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll
index 07229e22873c4..265658b1e3a2d 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -cost-kind=code-size -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI

 ; Verify the cost of vector shift left instructions.

@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v8i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift
 ;
 %shift = shl <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -273,6 +278,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = shl <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -306,6 +315,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: 'var_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = shl <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -335,6 +348,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = shl <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -397,6 +414,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; AVX512BW-LABEL: 'var_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a, %b
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = shl <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -762,6 +783,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> undef, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
@@ -915,6 +942,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %splat
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> undef, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer
@@ -1186,6 +1219,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = shl <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1215,6 +1252,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v16i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift
 ;
 %shift = shl <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1277,6 +1318,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'constant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %shift = shl <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = shl <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1477,6 +1522,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift
 ;
 %shift = shl <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1514,6 +1563,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift
 ;
 %shift = shl <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1613,6 +1666,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) {
 ; AVX512BW-LABEL: 'splatconstant_shift_v64i8'
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <64 x i8> %a,
 ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift
 ;
 %shift = shl <64 x i8> %a,
 ret <64 x i8> %shift
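Because every CHECK block in these files is autogenerated, a change like this one is normally produced by adding the new RUN line and regenerating, not by hand-editing assertions. A hedged sketch of that workflow (the script lives in llvm/utils/ in the monorepo; the exact flag and build path below are my assumptions, not taken from the patch):

; After appending the tigerlake RUN line to a test file, regenerate its
; CHECK lines with the in-tree updater:
;
;   python llvm/utils/update_analyze_test_checks.py \
;       --opt-binary=build/bin/opt \
;       llvm/test/Analysis/CostModel/X86/vshift-shl-codesize.ll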
diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll
index d8737d2365649..f093d1c8ca358 100644
--- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll
+++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost-inseltpoison.ll
@@ -15,6 +15,7 @@
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print<cost-model>" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI

 ; Verify the cost of vector shift left instructions.

@@ -236,6 +237,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) {
 ; SLM-LABEL: 'var_shift_v8i16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v8i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift
 ;
 %shift = shl <8 x i16> %a, %b
 ret <8 x i16> %shift
@@ -281,6 +286,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; SLM-LABEL: 'var_shift_v16i16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i16> %a, %b
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = shl <16 x i16> %a, %b
 ret <16 x i16> %shift
@@ -326,6 +335,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) {
 ; SLM-LABEL: 'var_shift_v32i16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i16> %a, %b
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = shl <32 x i16> %a, %b
 ret <32 x i16> %shift
@@ -371,6 +384,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) {
 ; SLM-LABEL: 'var_shift_v16i8'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, %b
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = shl <16 x i8> %a, %b
 ret <16 x i8> %shift
@@ -416,6 +433,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) {
 ; SLM-LABEL: 'var_shift_v32i8'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, %b
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = shl <32 x i8> %a, %b
 ret <32 x i8> %shift
@@ -461,6 +482,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) {
 ; SLM-LABEL: 'var_shift_v64i8'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, %b
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'var_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %b
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = shl <64 x i8> %a, %b
 ret <64 x i8> %shift
@@ -868,6 +893,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) {
 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
 ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, %splat
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> poison, i16 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %insert = insertelement <32 x i16> poison, i16 %b, i32 0
 %splat = shufflevector <32 x i16> %insert, <32 x i16> poison, <32 x i32> zeroinitializer
@@ -1051,6 +1082,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) {
 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
 ; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %shift = shl <64 x i8> %a, %splat
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatvar_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> poison, i8 %b, i32 0
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <64 x i8> %a, %splat
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %insert = insertelement <64 x i8> poison, i8 %b, i32 0
 %splat = shufflevector <64 x i8> %insert, <64 x i8> poison, <64 x i32> zeroinitializer
@@ -1305,6 +1342,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) {
 ; SLM-LABEL: 'constant_shift_v16i16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i16> %a,
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = shl <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1354,6 +1395,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) {
 ; SLM-LABEL: 'constant_shift_v32i16'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a,
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = shl <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1399,6 +1444,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) {
 ; SLM-LABEL: 'constant_shift_v16i8'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a,
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v16i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift
 ;
 %shift = shl <16 x i8> %a,
 ret <16 x i8> %shift
@@ -1444,6 +1493,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) {
 ; SLM-LABEL: 'constant_shift_v32i8'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a,
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v32i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
 ;
 %shift = shl <32 x i8> %a,
 ret <32 x i8> %shift
@@ -1489,6 +1542,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) {
 ; SLM-LABEL: 'constant_shift_v64i8'
 ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a,
 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'constant_shift_v64i8'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift
 ;
 %shift = shl <64 x i8> %a,
 ret <64 x i8> %shift
@@ -1677,6 +1734,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift
 ;
 %shift = shl <16 x i16> %a,
 ret <16 x i16> %shift
@@ -1718,6 +1779,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16'
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a,
+; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift
 ;
 %shift = shl <32 x i16> %a,
 ret <32 x i16> %shift
@@ -1784,6 +1849,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) {
 ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8'
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a,
 ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift
+;
+; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8'
AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1825,6 +1894,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -2044,6 +2117,10 @@ define <16 x i16> @test6(<16 x i16> %a) { ; SLM-LABEL: 'test6' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shl = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl +; +; AVX512GFNI-LABEL: 'test6' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl ; %shl = shl <16 x i16> %a, ret <16 x i16> %shl @@ -2162,6 +2239,10 @@ define <32 x i16> @test9(<32 x i16> %a) { ; SLM-LABEL: 'test9' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shl = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl +; +; AVX512GFNI-LABEL: 'test9' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl ; %shl = shl <32 x i16> %a, ret <32 x i16> %shl diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll index b254ff27daccb..09521cfa2d23d 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions. 
@@ -236,6 +237,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SLM-LABEL: 'var_shift_v8i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -281,6 +286,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SLM-LABEL: 'var_shift_v16i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -326,6 +335,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; SLM-LABEL: 'var_shift_v32i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -371,6 +384,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SLM-LABEL: 'var_shift_v16i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -416,6 +433,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SLM-LABEL: 'var_shift_v32i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift @@ -461,6 +482,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; SLM-LABEL: 'var_shift_v64i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, %b +; AVX512GFNI-NEXT: 
Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -868,6 +893,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -1051,6 +1082,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %shift = shl <64 x i8> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1305,6 +1342,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; SLM-LABEL: 'constant_shift_v16i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1354,6 +1395,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; SLM-LABEL: 'constant_shift_v32i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1399,6 +1444,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SLM-LABEL: 'constant_shift_v16i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <16 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1444,6 +1493,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SLM-LABEL: 'constant_shift_v32i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %shift = shl <32 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1489,6 +1542,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; SLM-LABEL: 'constant_shift_v64i8' ; SLM-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %shift = shl <64 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -1677,6 +1734,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v16i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1718,6 +1779,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i16' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1784,6 +1849,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v32i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift +; +; 
AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1825,6 +1894,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BWVL-LABEL: 'splatconstant_shift_v64i8' ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, ; AVX512BWVL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift @@ -2044,6 +2117,10 @@ define <16 x i16> @test6(<16 x i16> %a) { ; SLM-LABEL: 'test6' ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shl = shl <16 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl +; +; AVX512GFNI-LABEL: 'test6' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i16> %shl ; %shl = shl <16 x i16> %a, ret <16 x i16> %shl @@ -2162,6 +2239,10 @@ define <32 x i16> @test9(<32 x i16> %a) { ; SLM-LABEL: 'test9' ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shl = shl <32 x i16> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl +; +; AVX512GFNI-LABEL: 'test9' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shl = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <32 x i16> %shl ; %shl = shl <32 x i16> %a, ret <32 x i16> %shl diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll index c53dbc5f586aa..42c91144ff6f9 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-latency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=slm | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=goldmont | FileCheck %s --check-prefixes=SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=btver2 | FileCheck %s --check-prefixes=AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=latency -mcpu=tigerlake | FileCheck %s --check-prefixes=AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions. 
@@ -240,6 +241,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, %b ret <8 x i16> %shift @@ -277,6 +282,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift @@ -314,6 +323,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; AVX512BW-LABEL: 'var_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift @@ -351,6 +364,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; AVX512BW-LABEL: 'var_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, %b ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, %b ret <16 x i8> %shift @@ -830,6 +847,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i16> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = 
insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -983,6 +1006,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <64 x i8> %a, %splat ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1208,6 +1237,10 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v8i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; %shift = shl <8 x i16> %a, ret <8 x i16> %shift @@ -1241,6 +1274,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1274,6 +1311,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1311,6 +1352,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1617,6 +1662,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1650,6 +1699,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1683,6 +1736,10 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <16 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; %shift = shl <16 x i8> %a, ret <16 x i8> %shift @@ -1745,6 +1802,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll index cbf0f24b90fb8..47b24df063ef7 100644 --- a/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/vshift-shl-sizelatency.ll @@ -15,6 +15,7 @@ ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency 
-mcpu=btver2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes="print" 2>&1 -disable-output -cost-kind=size-latency -mcpu=tigerlake | FileCheck %s --check-prefixes=CHECK,AVX512,AVX512GFNI ; Verify the cost of vector shift left instructions. @@ -263,6 +264,10 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v8i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <8 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift +; ; BTVER2-LABEL: 'var_shift_v8i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %shift = shl <8 x i16> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %shift @@ -303,6 +308,10 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; ; BTVER2-LABEL: 'var_shift_v16i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %shift = shl <16 x i16> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift @@ -343,6 +352,10 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i16> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; ; BTVER2-LABEL: 'var_shift_v32i16' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 50 for instruction: %shift = shl <32 x i16> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift @@ -383,6 +396,10 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = shl <16 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; ; BTVER2-LABEL: 'var_shift_v16i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %shift = shl <16 x i8> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift @@ -423,6 +440,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i8> %a,
%b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %shift = shl <32 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; ; BTVER2-LABEL: 'var_shift_v32i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %shift = shl <32 x i8> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift @@ -463,6 +484,10 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %shift = shl <64 x i8> %a, %b ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; +; AVX512GFNI-LABEL: 'var_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <64 x i8> %a, %b +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; ; BTVER2-LABEL: 'var_shift_v64i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %shift = shl <64 x i8> %a, %b ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift @@ -824,6 +849,12 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <32 x i16> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <32 x i16> undef, i16 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i16> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -995,6 +1026,12 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer ; SLM-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %shift = shl <64 x i8> %a, %splat ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatvar_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %insert = insertelement <64 x i8> undef, i8 %b, i32 0 +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %shift = shl <64 x i8> %a, %splat +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = 
shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer @@ -1238,6 +1275,10 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1271,6 +1312,10 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'constant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'constant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1309,6 +1354,10 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) { ; SLM-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %shift = shl <16 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift ; +; AVX512GFNI-LABEL: 'constant_shift_v16i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %shift = shl <16 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift +; ; BTVER2-LABEL: 'constant_shift_v16i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %shift = shl <16 x i8> %a, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %shift @@ -1349,6 +1398,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; SLM-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %shift = shl <32 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; +; AVX512GFNI-LABEL: 'constant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; ; BTVER2-LABEL: 'constant_shift_v32i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %shift = shl <32 x i8> %a, ; BTVER2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift @@ -1389,6 +1442,10 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; SLM-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %shift = shl <64 x i8> %a, ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; +; AVX512GFNI-LABEL: 'constant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; ; BTVER2-LABEL: 'constant_shift_v64i8' ; BTVER2-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %shift = shl <64 x i8> %a, ; BTVER2-NEXT: Cost Model: Found an estimated 
cost of 1 for instruction: ret <64 x i8> %shift @@ -1571,6 +1628,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v16i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v16i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <16 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i16> %shift ; %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -1604,6 +1665,10 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i16' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i16' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %shift = shl <32 x i16> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i16> %shift ; %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -1662,6 +1727,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v32i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v32i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <32 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <32 x i8> %shift ; %shift = shl <32 x i8> %a, ret <32 x i8> %shift @@ -1695,6 +1764,10 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; AVX512BW-LABEL: 'splatconstant_shift_v64i8' ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <64 x i8> %a, ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift +; +; AVX512GFNI-LABEL: 'splatconstant_shift_v64i8' +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %shift = shl <64 x i8> %a, +; AVX512GFNI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <64 x i8> %shift ; %shift = shl <64 x i8> %a, ret <64 x i8> %shift diff --git a/llvm/test/Bindings/llvm-c/atomics.ll b/llvm/test/Bindings/llvm-c/atomics.ll index e64a29944ef9d..162368c9d98d0 100644 --- a/llvm/test/Bindings/llvm-c/atomics.ll +++ b/llvm/test/Bindings/llvm-c/atomics.ll @@ -36,6 +36,31 @@ define void @atomic_load_store(ptr %word) { ret void } +define void @atomic_rmw_ops(ptr %p, i32 %i, float %f) { + ; Test all atomicrmw operations + %a.xchg = atomicrmw xchg ptr %p, i32 %i acq_rel, align 8 + %a.add = atomicrmw add ptr %p, i32 %i acq_rel, align 8 + %a.sub = atomicrmw sub ptr %p, i32 %i acq_rel, align 8 + %a.and = atomicrmw and ptr %p, i32 %i acq_rel, align 8 + %a.nand = atomicrmw nand ptr %p, i32 %i acq_rel, align 8 + %a.or = atomicrmw or ptr %p, i32 %i acq_rel, align 8 + %a.xor = atomicrmw xor ptr %p, i32 %i acq_rel, align 8 + %a.max = atomicrmw max ptr %p, i32 %i acq_rel, align 8 + %a.min = atomicrmw min ptr %p, i32 %i acq_rel, align 8 + %a.umax = atomicrmw umax ptr %p, i32 %i acq_rel, 
align 8 + %a.umin = atomicrmw umin ptr %p, i32 %i acq_rel, align 8 + + %a.fadd = atomicrmw fadd ptr %p, float %f acq_rel, align 8 + %a.fsub = atomicrmw fsub ptr %p, float %f acq_rel, align 8 + %a.fmax = atomicrmw fmax ptr %p, float %f acq_rel, align 8 + %a.fmin = atomicrmw fmin ptr %p, float %f acq_rel, align 8 + + %a.uinc_wrap = atomicrmw uinc_wrap ptr %p, i32 %i acq_rel, align 8 + %a.udec_wrap = atomicrmw udec_wrap ptr %p, i32 %i acq_rel, align 8 + + ret void +} + define i32 @main() { %1 = alloca i32, align 4 %2 = cmpxchg ptr %1, i32 2, i32 3 seq_cst acquire diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll index a131f35e66d03..a61931b898aea 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll @@ -1385,6 +1385,16 @@ define i32 @test_intrinsic_lrint(float %a) { ret i32 %res } +declare i32 @llvm.llrint.i32.f32(float) +define i32 @test_intrinsic_llrint(float %a) { +; CHECK-LABEL: name: test_intrinsic_llrint +; CHECK: [[A:%[0-9]+]]:_(s32) = COPY $s0 +; CHECK: [[RES:%[0-9]+]]:_(s32) = G_INTRINSIC_LLRINT [[A]] +; CHECK: $w0 = COPY [[RES]] + %res = call i32 @llvm.llrint.i32.f32(float %a) + ret i32 %res +} + declare i32 @llvm.ctlz.i32(i32, i1) define i32 @test_ctlz_intrinsic_zero_not_undef(i32 %a) { ; CHECK-LABEL: name: test_ctlz_intrinsic_zero_not_undef diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir index 76d82884a7b1f..d791660b7a5eb 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-fp-const-lhs.mir @@ -116,3 +116,129 @@ body: | $q0 = COPY %mul RET_ReallyLR ... +--- +name: fminnum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminnum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINNUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINNUM %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaxnum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaxnum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXNUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXNUM %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: fminnum_ieee +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminnum_ieee + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINNUM_IEEE [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINNUM_IEEE %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... 
+--- +name: fmaxnum_ieee +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaxnum_ieee + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXNUM_IEEE [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXNUM_IEEE %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: fminimum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fminimum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %min:_(s32) = G_FMINIMUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %min:_(s32) = G_FMINIMUM %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: fmaximum +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: fmaximum + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + ; CHECK-NEXT: %max:_(s32) = G_FMAXIMUM [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_FCONSTANT float 2.000000e+00 + %max:_(s32) = G_FMAXIMUM %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir new file mode 100644 index 0000000000000..16365494f5f4e --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-commute-int-const-lhs.mir @@ -0,0 +1,456 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: add +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: add + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_ADD [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_ADD %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: mul +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: mul + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_MUL [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_MUL %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... 
+--- +name: and +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: and + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %and:_(s32) = G_AND [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %and(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %and:_(s32) = G_AND %cst, %0 + $s0 = COPY %and + RET_ReallyLR +... +--- +name: or +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: or + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %or:_(s32) = G_OR [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %or(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %or:_(s32) = G_OR %cst, %0 + $s0 = COPY %or + RET_ReallyLR +... +--- +name: xor +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: xor + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 5 + ; CHECK-NEXT: %xor:_(s32) = G_XOR [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %xor(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 5 + %xor:_(s32) = G_XOR %cst, %0 + $s0 = COPY %xor + RET_ReallyLR +... +--- +name: smin +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smin + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %min:_(s32) = G_SMIN [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %min:_(s32) = G_SMIN %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: smax +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smax + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %max:_(s32) = G_SMAX [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %max:_(s32) = G_SMAX %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... +--- +name: umin +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umin + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %min:_(s32) = G_UMIN [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %min(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %min:_(s32) = G_UMIN %cst, %0 + $s0 = COPY %min + RET_ReallyLR +... +--- +name: umax +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umax + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 10 + ; CHECK-NEXT: %max:_(s32) = G_UMAX [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %max(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 10 + %max:_(s32) = G_UMAX %cst, %0 + $s0 = COPY %max + RET_ReallyLR +... 
+--- +name: uaddo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: uaddo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32), %overflow:_(s1) = G_UADDO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32), %overflow:_(s1) = G_UADDO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR + +... +--- +name: saddo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: saddo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32), %overflow:_(s1) = G_SADDO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32), %overflow:_(s1) = G_SADDO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR + +... +--- +name: umulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_UMULO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32), %overflow:_(s1) = G_UMULO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR +... +--- +name: smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulo + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO [[COPY]], %cst + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $s0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cst, %0 + %ret:_(s32) = G_ANYEXT %overflow + $s0 = COPY %ret + RET_ReallyLR +... +--- +name: umulh +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulh + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULH [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULH %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: smulh +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulh + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_SMULH [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_SMULH %cst, %0 + $s0 = COPY %mul + RET_ReallyLR +... 
+--- +name: uaddsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: uaddsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_UADDSAT [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_UADDSAT %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: saddsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: saddsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %add:_(s32) = G_SADDSAT [[COPY]], %cst + ; CHECK-NEXT: $s0 = COPY %add(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 1 + %add:_(s32) = G_SADDSAT %cst, %0 + $s0 = COPY %add + RET_ReallyLR + +... +--- +name: smulfix +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulfix + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_SMULFIX [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_SMULFIX %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: umulfix +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulfix + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIX [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULFIX %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: smulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: smulfixsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_SMULFIXSAT [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_SMULFIXSAT %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... +--- +name: umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $s0 + + ; CHECK-LABEL: name: umulfixsat + ; CHECK: liveins: $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 3 + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIXSAT [[COPY]], %cst, 7 + ; CHECK-NEXT: $s0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $s0 + %cst:_(s32) = G_CONSTANT i32 3 + %mul:_(s32) = G_UMULFIXSAT %cst, %0, 7 + $s0 = COPY %mul + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir index 01e0dce5a661c..c967e4f2ea5e8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-const-fold-barrier-rhs.mir @@ -78,3 +78,163 @@ body: | RET_ReallyLR ... 
+--- +name: cfb_lhs_smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO [[COPY]], %cfb + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %0 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_cfb_already_rhs_smulo +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cfb_already_rhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cfb2 + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 6 + %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cfb2 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_cst_on_rhs_smulo +alignment: 4 +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_cst_on_rhs_smulo + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 6 + ; CHECK-NEXT: %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cst2 + ; CHECK-NEXT: %ret:_(s32) = G_ANYEXT %overflow(s1) + ; CHECK-NEXT: $w0 = COPY %ret(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %cst2:_(s32) = G_CONSTANT i32 6 + %mul:_(s32), %overflow:_(s1) = G_SMULO %cfb, %cst2 + %ret:_(s32) = G_ANYEXT %overflow + $w0 = COPY %ret + RET_ReallyLR + +... +--- +name: cfb_lhs_umulfixsat +tracksRegLiveness: true +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: cfb_lhs_umulfixsat + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0 + ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + ; CHECK-NEXT: %mul:_(s32) = G_UMULFIXSAT [[COPY]], %cfb, 7 + ; CHECK-NEXT: $w0 = COPY %mul(s32) + ; CHECK-NEXT: RET_ReallyLR + %0:_(s32) = COPY $w0 + %cst:_(s32) = G_CONSTANT i32 1 + %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst + %mul:_(s32) = G_UMULFIXSAT %cfb, %0, 7 + $w0 = COPY %mul + RET_ReallyLR + +... 
+---
+name: cfb_lhs_cfb_already_rhs_umulfixsat
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: cfb_lhs_cfb_already_rhs_umulfixsat
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2
+    ; CHECK-NEXT: %add:_(s32) = G_UMULFIXSAT %cfb, %cfb2, 7
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $w0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    %cst2:_(s32) = G_CONSTANT i32 2
+    %cfb2:_(s32) = G_CONSTANT_FOLD_BARRIER %cst2
+    %add:_(s32) = G_UMULFIXSAT %cfb, %cfb2, 7
+    $w0 = COPY %add
+    RET_ReallyLR
+
+...
+---
+name: cfb_lhs_cst_on_rhs_umulfixsat
+alignment: 4
+tracksRegLiveness: true
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: cfb_lhs_cst_on_rhs_umulfixsat
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %cst:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    ; CHECK-NEXT: %cst2:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: %add:_(s32) = G_UMULFIXSAT %cfb, %cst2, 7
+    ; CHECK-NEXT: $w0 = COPY %add(s32)
+    ; CHECK-NEXT: RET_ReallyLR
+    %0:_(s32) = COPY $w0
+    %cst:_(s32) = G_CONSTANT i32 1
+    %cfb:_(s32) = G_CONSTANT_FOLD_BARRIER %cst
+    %cst2:_(s32) = G_CONSTANT i32 2
+    %add:_(s32) = G_UMULFIXSAT %cfb, %cst2, 7
+    $w0 = COPY %add
+    RET_ReallyLR
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
index ceef0c49a45ec..9a525151ca328 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll
@@ -140,10 +140,10 @@ define <8 x i16> @combine_vec_udiv_nonuniform3(<8 x i16> %x) {
 ; GISEL-NEXT:    umull2 v2.4s, v0.8h, v1.8h
 ; GISEL-NEXT:    umull v1.4s, v0.4h, v1.4h
 ; GISEL-NEXT:    uzp2 v1.8h, v1.8h, v2.8h
+; GISEL-NEXT:    ldr q2, [x8, :lo12:.LCPI3_0]
 ; GISEL-NEXT:    sub v0.8h, v0.8h, v1.8h
 ; GISEL-NEXT:    usra v1.8h, v0.8h, #1
-; GISEL-NEXT:    ldr q0, [x8, :lo12:.LCPI3_0]
-; GISEL-NEXT:    neg v0.8h, v0.8h
+; GISEL-NEXT:    neg v0.8h, v2.8h
 ; GISEL-NEXT:    ushl v0.8h, v1.8h, v0.8h
 ; GISEL-NEXT:    ret
   %1 = udiv <8 x i16> %x,
@@ -170,13 +170,13 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 ; GISEL-LABEL: combine_vec_udiv_nonuniform4:
 ; GISEL:       // %bb.0:
 ; GISEL-NEXT:    adrp x8, .LCPI4_2
+; GISEL-NEXT:    adrp x9, .LCPI4_0
 ; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI4_2]
 ; GISEL-NEXT:    adrp x8, .LCPI4_1
+; GISEL-NEXT:    ldr q4, [x9, :lo12:.LCPI4_0]
 ; GISEL-NEXT:    ldr q3, [x8, :lo12:.LCPI4_1]
-; GISEL-NEXT:    adrp x8, .LCPI4_0
 ; GISEL-NEXT:    umull2 v2.8h, v0.16b, v1.16b
 ; GISEL-NEXT:    umull v1.8h, v0.8b, v1.8b
-; GISEL-NEXT:    ldr q4, [x8, :lo12:.LCPI4_0]
 ; GISEL-NEXT:    uzp2 v1.16b, v1.16b, v2.16b
 ; GISEL-NEXT:    neg v2.16b, v3.16b
 ; GISEL-NEXT:    shl v3.16b, v4.16b, #7
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir
new file mode 100644
index 0000000000000..f77649b793951
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-llrint.mir
@@ -0,0 +1,98 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=legalizer %s -o - | FileCheck %s
+---
+name: testmsws
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$s0' }
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: testmsws
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s32)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[INTRINSIC_LLRINT]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $s0
+    %1:_(s64) = G_INTRINSIC_LLRINT %0(s32)
+    %2:_(s32) = G_TRUNC %1(s64)
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: testmsxs
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$s0' }
+body: |
+  bb.1:
+    liveins: $s0
+
+    ; CHECK-LABEL: name: testmsxs
+    ; CHECK: liveins: $s0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+    ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s32)
+    ; CHECK-NEXT: $x0 = COPY [[INTRINSIC_LLRINT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s32) = COPY $s0
+    %1:_(s64) = G_INTRINSIC_LLRINT %0(s32)
+    $x0 = COPY %1(s64)
+    RET_ReallyLR implicit $x0
+
+...
+---
+name: testmswd
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+body: |
+  bb.1:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: testmswd
+    ; CHECK: liveins: $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s64)
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[INTRINSIC_LLRINT]](s64)
+    ; CHECK-NEXT: $w0 = COPY [[TRUNC]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s64) = COPY $d0
+    %1:_(s64) = G_INTRINSIC_LLRINT %0(s64)
+    %2:_(s32) = G_TRUNC %1(s64)
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: testmsxd
+alignment: 4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$d0' }
+body: |
+  bb.1:
+    liveins: $d0
+
+    ; CHECK-LABEL: name: testmsxd
+    ; CHECK: liveins: $d0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+    ; CHECK-NEXT: [[INTRINSIC_LLRINT:%[0-9]+]]:_(s64) = G_INTRINSIC_LLRINT [[COPY]](s64)
+    ; CHECK-NEXT: $x0 = COPY [[INTRINSIC_LLRINT]](s64)
+    ; CHECK-NEXT: RET_ReallyLR implicit $x0
+    %0:_(s64) = COPY $d0
+    %1:_(s64) = G_INTRINSIC_LLRINT %0(s64)
+    $x0 = COPY %1(s64)
+    RET_ReallyLR implicit $x0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index 0793f3983c8e5..098726b0a980d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -156,6 +156,10 @@
 # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
 # DEBUG-NEXT: .. the first uncovered type index: 2, OK
 # DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: G_INTRINSIC_LLRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 2, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
 # DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices
 # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
 # DEBUG-NEXT: ..
type index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir index 8c4300d9e7329..03e507f5eaa7f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-select-to-fminmax.mir @@ -11,7 +11,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s16) = COPY $h0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s16) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $h0 = COPY [[FMAXIMUM]](s16) ; CHECK-NEXT: RET_ReallyLR implicit $h0 %0:_(s16) = COPY $h0 @@ -33,7 +33,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s32) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $s0 = COPY [[FMAXIMUM]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $s0 %0:_(s32) = COPY $s0 @@ -55,7 +55,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(s64) = G_FMAXIMUM [[COPY]], [[C]] ; CHECK-NEXT: $d0 = COPY [[FMAXIMUM]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 @@ -77,7 +77,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 - ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[C]], [[COPY]] + ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(s64) = G_FMINIMUM [[COPY]], [[C]] ; CHECK-NEXT: $d0 = COPY [[FMINIMUM]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $d0 %0:_(s64) = COPY $d0 @@ -100,7 +100,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16), [[C]](s16) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<8 x s16>) = G_FMAXIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<8 x s16>) = G_FMAXIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 @@ -125,7 +125,7 @@ body: | ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) - ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<4 x s32>) = G_FMAXIMUM [[BUILD_VECTOR]], [[BITCAST]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<4 x s32>) = G_FMAXIMUM [[BITCAST]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %1:_(<2 x s64>) = COPY $q0 @@ -150,7 +150,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) 
- ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMAXIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMAXIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMAXIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMAXIMUM]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 @@ -174,7 +174,7 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 0.000000e+00 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C]](s64), [[C]](s64) - ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMINIMUM [[BUILD_VECTOR]], [[COPY]] + ; CHECK-NEXT: [[FMINIMUM:%[0-9]+]]:_(<2 x s64>) = G_FMINIMUM [[COPY]], [[BUILD_VECTOR]] ; CHECK-NEXT: $q0 = COPY [[FMINIMUM]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll index 7badf4732fd0d..ae0a9b1c7c4f1 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-to-fmin-fmax.ll @@ -4,7 +4,7 @@ define half @test_s16(half %a) #0 { ; CHECK-LABEL: test_s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax h0, h1, h0 +; CHECK-NEXT: fmax h0, h0, h1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt half %a, 0.0 @@ -16,7 +16,7 @@ define float @test_s32(float %a) #0 { ; CHECK-LABEL: test_s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax s0, s1, s0 +; CHECK-NEXT: fmax s0, s0, s1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt float %a, 0.0 @@ -28,7 +28,7 @@ define double @test_s64(double %a) #0 { ; CHECK-LABEL: test_s64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi d1, #0000000000000000 -; CHECK-NEXT: fmax d0, d1, d0 +; CHECK-NEXT: fmax d0, d0, d1 ; CHECK-NEXT: ret entry: %fcmp = fcmp olt double %a, 0.0 @@ -40,7 +40,7 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 { ; CHECK-LABEL: test_v4s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.4h, v1.4h, v0.4h +; CHECK-NEXT: fmax v0.4h, v0.4h, v1.4h ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <4 x half> %a, zeroinitializer @@ -52,7 +52,7 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 { ; CHECK-LABEL: test_v8s16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.8h, v1.8h, v0.8h +; CHECK-NEXT: fmax v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <8 x half> %a, zeroinitializer @@ -64,7 +64,7 @@ define <2 x float> @test_v2s32(<2 x float> %a) #0 { ; CHECK-LABEL: test_v2s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.2s, v1.2s, v0.2s +; CHECK-NEXT: fmax v0.2s, v0.2s, v1.2s ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <2 x float> %a, zeroinitializer @@ -76,7 +76,7 @@ define <4 x float> @test_v4s32(<4 x float> %a) #0 { ; CHECK-LABEL: test_v4s32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.4s, v1.4s, v0.4s +; CHECK-NEXT: fmax v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <4 x float> %a, zeroinitializer @@ -88,7 +88,7 @@ define <2 x double> @test_v2s64(<2 x double> %a) #0 { ; CHECK-LABEL: test_v2s64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: fmax v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmax v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret entry: %fcmp = fcmp olt <2 x double> %a, 
zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll index bd576d0f70e9c..8c6e01d934c2d 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/vastart.ll @@ -3,7 +3,7 @@ declare void @llvm.va_start(ptr) -define void @test_va_start(ptr %list) { +define void @test_va_start(ptr %list, ...) { ; CHECK-LABEL: name: test_va_start ; CHECK: [[LIST:%[0-9]+]]:_(p0) = COPY $x0 ; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store (s64) into %ir.list, align 1) diff --git a/llvm/test/CodeGen/AArch64/aarch64-addv.ll b/llvm/test/CodeGen/AArch64/aarch64-addv.ll index b77d591347659..ee035ec1941d5 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-addv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-addv.ll @@ -101,12 +101,12 @@ define i32 @oversized_ADDV_256(ptr noalias nocapture readonly %arg1, ptr noalias ; GISEL-NEXT: ushll v2.8h, v2.8b, #0 ; GISEL-NEXT: usubl v3.4s, v1.4h, v2.4h ; GISEL-NEXT: usubl2 v1.4s, v1.8h, v2.8h -; GISEL-NEXT: neg v2.4s, v3.4s -; GISEL-NEXT: neg v4.4s, v1.4s -; GISEL-NEXT: cmgt v5.4s, v0.4s, v3.4s +; GISEL-NEXT: cmgt v2.4s, v0.4s, v3.4s ; GISEL-NEXT: cmgt v0.4s, v0.4s, v1.4s -; GISEL-NEXT: bif v2.16b, v3.16b, v5.16b -; GISEL-NEXT: bsl v0.16b, v4.16b, v1.16b +; GISEL-NEXT: neg v4.4s, v3.4s +; GISEL-NEXT: neg v5.4s, v1.4s +; GISEL-NEXT: bsl v2.16b, v4.16b, v3.16b +; GISEL-NEXT: bsl v0.16b, v5.16b, v1.16b ; GISEL-NEXT: add v0.4s, v2.4s, v0.4s ; GISEL-NEXT: addv s0, v0.4s ; GISEL-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll index fdeae9f326ad8..36b81d8e495ce 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-dup-ext-scalable.ll @@ -4,8 +4,8 @@ define @dupsext_v2i8_v2i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -20,8 +20,8 @@ entry: define @dupsext_v4i8_v4i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v4i8_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -36,8 +36,8 @@ entry: define @dupsext_v8i8_v8i16(i8 %src, %b) { ; CHECK-LABEL: dupsext_v8i8_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -52,8 +52,8 @@ entry: define @dupsext_v2i8_v2i32(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -68,8 +68,8 @@ entry: define @dupsext_v4i8_v4i32(i8 %src, %b) { ; CHECK-LABEL: dupsext_v4i8_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxtb w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -84,9 +84,9 @@ entry: define @dupsext_v2i8_v2i64(i8 %src, %b) { ; CHECK-LABEL: dupsext_v2i8_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtb x8, w0 +; CHECK-NEXT: ptrue p0.d ; 
CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -101,8 +101,8 @@ entry: define @dupsext_v2i16_v2i32(i16 %src, %b) { ; CHECK-LABEL: dupsext_v2i16_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -117,8 +117,8 @@ entry: define @dupsext_v4i16_v4i32(i16 %src, %b) { ; CHECK-LABEL: dupsext_v4i16_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sxth w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -133,9 +133,9 @@ entry: define @dupsext_v2i16_v2i64(i16 %src, %b) { ; CHECK-LABEL: dupsext_v2i16_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxth x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -150,9 +150,9 @@ entry: define @dupsext_v2i32_v2i64(i32 %src, %b) { ; CHECK-LABEL: dupsext_v2i32_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -167,8 +167,8 @@ entry: define @dupzext_v2i8_v2i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -183,8 +183,8 @@ entry: define @dupzext_v4i8_v4i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v4i8_v4i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -199,8 +199,8 @@ entry: define @dupzext_v8i8_v8i16(i8 %src, %b) { ; CHECK-LABEL: dupzext_v8i8_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -215,8 +215,8 @@ entry: define @dupzext_v2i8_v2i32(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -231,8 +231,8 @@ entry: define @dupzext_v4i8_v4i32(i8 %src, %b) { ; CHECK-LABEL: dupzext_v4i8_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -247,9 +247,9 @@ entry: define @dupzext_v2i8_v2i64(i8 %src, %b) { ; CHECK-LABEL: dupzext_v2i8_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -264,8 +264,8 @@ entry: define @dupzext_v2i16_v2i32(i16 %src, %b) { ; CHECK-LABEL: dupzext_v2i16_v2i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; 
CHECK-NEXT: ret @@ -280,8 +280,8 @@ entry: define @dupzext_v4i16_v4i32(i16 %src, %b) { ; CHECK-LABEL: dupzext_v4i16_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -296,9 +296,9 @@ entry: define @dupzext_v2i16_v2i64(i16 %src, %b) { ; CHECK-LABEL: dupzext_v2i16_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -313,8 +313,8 @@ entry: define @dupzext_v2i32_v2i64(i32 %src, %b) { ; CHECK-LABEL: dupzext_v2i32_v2i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index 61a4f64ac2bfc..540471a05901a 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -257,8 +257,8 @@ define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: ; CHECK-SVE-NEXT: ldrh w8, [x0] -; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldrh w9, [x0, #2] +; CHECK-SVE-NEXT: ptrue p0.d, vl2 ; CHECK-SVE-NEXT: ldr d0, [x1] ; CHECK-SVE-NEXT: fmov d1, x8 ; CHECK-SVE-NEXT: sshll v0.2d, v0.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/active_lane_mask.ll b/llvm/test/CodeGen/AArch64/active_lane_mask.ll index a65c5d6667794..43122c8c953fc 100644 --- a/llvm/test/CodeGen/AArch64/active_lane_mask.ll +++ b/llvm/test/CodeGen/AArch64/active_lane_mask.ll @@ -131,9 +131,9 @@ define @lane_mask_nxv2i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: index z0.d, #0, #1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0xff +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-NEXT: and x8, x1, #0xff ; CHECK-NEXT: and z0.d, z0.d, #0xff ; CHECK-NEXT: add z0.d, z0.d, z1.d @@ -153,6 +153,7 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill @@ -160,16 +161,16 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: mov z1.s, w0 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z25.s, w1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.d, z0.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: uqadd z6.s, z0.s, z1.s ; CHECK-NEXT: incw z0.s, all, mul #4 ; CHECK-NEXT: incw z2.s ; CHECK-NEXT: incw z3.s, all, mul #2 -; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z6.s ; CHECK-NEXT: uqadd z0.s, z0.s, z1.s +; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z6.s ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: uqadd z5.s, z2.s, z1.s ; CHECK-NEXT: uqadd z7.s, z3.s, z1.s @@ -177,25 +178,26 @@ define @lane_mask_nxv32i1_i32(i32 %index, i32 %TC) { ; CHECK-NEXT: incw z3.s, all, mul #4 ; CHECK-NEXT: cmphi p5.s, p0/z, z25.s, z0.s ; CHECK-NEXT: incw z4.s, all, mul #2 -; CHECK-NEXT: cmphi p1.s, p0/z, z25.s, z5.s -; CHECK-NEXT: cmphi p3.s, p0/z, z25.s, z7.s ; CHECK-NEXT: uqadd z2.s, z2.s, z1.s ; CHECK-NEXT: uqadd z3.s, z3.s, z1.s +; CHECK-NEXT: cmphi p1.s, p0/z, z25.s, z5.s +; CHECK-NEXT: cmphi p3.s, p0/z, z25.s, z7.s ; CHECK-NEXT: uqadd z24.s, z4.s, z1.s ; CHECK-NEXT: incw z4.s, all, mul #4 -; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: cmphi p6.s, p0/z, z25.s, z2.s -; CHECK-NEXT: cmphi p2.s, p0/z, z25.s, z3.s +; CHECK-NEXT: cmphi p7.s, p0/z, z25.s, z3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: uqadd z1.s, z4.s, z1.s ; CHECK-NEXT: cmphi p4.s, p0/z, z25.s, z24.s -; CHECK-NEXT: uzp1 p3.h, p3.h, p4.h ; CHECK-NEXT: cmphi p0.s, p0/z, z25.s, z1.s -; CHECK-NEXT: uzp1 p4.h, p5.h, p6.h +; CHECK-NEXT: uzp1 p2.h, p3.h, p4.h +; CHECK-NEXT: uzp1 p3.h, p5.h, p6.h ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.h, p2.h, p0.h -; CHECK-NEXT: uzp1 p0.b, p1.b, p3.b -; CHECK-NEXT: uzp1 p1.b, p4.b, p2.b +; CHECK-NEXT: uzp1 p4.h, p7.h, p0.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -208,96 +210,97 @@ define @lane_mask_nxv32i1_i64(i64 %index, i64 %TC) { ; CHECK-LABEL: lane_mask_nxv32i1_i64: ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG +; CHECK-NEXT: index z5.d, #0, #1 ; CHECK-NEXT: mov z0.d, x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, x1 -; CHECK-NEXT: mov z2.d, z1.d -; CHECK-NEXT: mov z4.d, z1.d -; CHECK-NEXT: mov z6.d, z1.d -; CHECK-NEXT: uqadd z25.d, z1.d, z0.d -; CHECK-NEXT: incd z1.d, all, mul #8 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, z5.d +; CHECK-NEXT: mov z1.d, z5.d +; CHECK-NEXT: mov z4.d, z5.d +; CHECK-NEXT: uqadd z25.d, z5.d, z0.d +; CHECK-NEXT: incd z5.d, all, mul #8 ; CHECK-NEXT: incd z2.d -; CHECK-NEXT: incd z4.d, all, mul #2 -; CHECK-NEXT: incd z6.d, all, mul #4 -; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z1.d, z1.d, z0.d -; CHECK-NEXT: mov z5.d, z2.d -; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: incd z1.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z25.d +; CHECK-NEXT: mov z6.d, z2.d ; CHECK-NEXT: mov z7.d, z2.d -; CHECK-NEXT: mov z24.d, z4.d -; CHECK-NEXT: uqadd z27.d, z4.d, z0.d -; CHECK-NEXT: uqadd z28.d, z6.d, z0.d +; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: uqadd z26.d, z2.d, z0.d +; CHECK-NEXT: uqadd z27.d, z1.d, z0.d +; CHECK-NEXT: uqadd z28.d, z4.d, z0.d ; CHECK-NEXT: incd z2.d, all, mul #8 +; CHECK-NEXT: incd z1.d, all, mul #8 ; CHECK-NEXT: incd z4.d, all, mul #8 -; CHECK-NEXT: incd z6.d, all, mul #8 -; CHECK-NEXT: incd z5.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #2 ; CHECK-NEXT: incd z7.d, all, mul #4 -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z26.d ; CHECK-NEXT: incd z24.d, all, mul #4 -; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z27.d -; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z28.d +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z26.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z27.d +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z28.d +; CHECK-NEXT: mov z31.d, z6.d +; CHECK-NEXT: uqadd z29.d, z6.d, z0.d +; CHECK-NEXT: uqadd z30.d, z7.d, z0.d +; CHECK-NEXT: uqadd z8.d, z24.d, z0.d +; CHECK-NEXT: incd z6.d, all, mul #8 +; CHECK-NEXT: incd z7.d, all, mul #8 +; CHECK-NEXT: incd z24.d, all, mul #8 ; CHECK-NEXT: uqadd z2.d, z2.d, z0.d +; CHECK-NEXT: uqadd z1.d, z1.d, z0.d +; CHECK-NEXT: incd z31.d, all, mul #4 ; CHECK-NEXT: uqadd z4.d, z4.d, z0.d +; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z29.d +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z30.d ; CHECK-NEXT: uqadd z6.d, z6.d, z0.d -; CHECK-NEXT: mov z26.d, z5.d -; CHECK-NEXT: uqadd z25.d, z5.d, z0.d -; CHECK-NEXT: uqadd z27.d, z7.d, z0.d -; CHECK-NEXT: incd z5.d, all, mul #8 -; 
CHECK-NEXT: incd z7.d, all, mul #8 -; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s -; CHECK-NEXT: incd z26.d, all, mul #4 -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z2.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z25.d -; CHECK-NEXT: uqadd z25.d, z24.d, z0.d -; CHECK-NEXT: incd z24.d, all, mul #8 -; CHECK-NEXT: uqadd z5.d, z5.d, z0.d +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z8.d +; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: uqadd z7.d, z7.d, z0.d -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z27.d -; CHECK-NEXT: uqadd z28.d, z26.d, z0.d -; CHECK-NEXT: incd z26.d, all, mul #8 -; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: uqadd z25.d, z31.d, z0.d +; CHECK-NEXT: incd z31.d, all, mul #8 ; CHECK-NEXT: uqadd z24.d, z24.d, z0.d -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z25.d -; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z1.d -; CHECK-NEXT: uzp1 p5.s, p5.s, p6.s -; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z4.d -; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z5.d -; CHECK-NEXT: cmphi p10.d, p0/z, z3.d, z7.d -; CHECK-NEXT: uqadd z0.d, z26.d, z0.d -; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z28.d -; CHECK-NEXT: uzp1 p4.s, p4.s, p8.s -; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z24.d -; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: cmphi p4.d, p0/z, z3.d, z5.d +; CHECK-NEXT: uzp1 p2.s, p2.s, p5.s +; CHECK-NEXT: cmphi p5.d, p0/z, z3.d, z2.d +; CHECK-NEXT: cmphi p9.d, p0/z, z3.d, z6.d +; CHECK-NEXT: uqadd z0.d, z31.d, z0.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p7.s +; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z25.d +; CHECK-NEXT: uzp1 p2.h, p3.h, p2.h +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z7.d +; CHECK-NEXT: uzp1 p4.s, p4.s, p5.s +; CHECK-NEXT: uzp1 p5.s, p7.s, p9.s ; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h -; CHECK-NEXT: uzp1 p2.s, p7.s, p2.s -; CHECK-NEXT: cmphi p7.d, p0/z, z3.d, z6.d +; CHECK-NEXT: uzp1 p6.s, p6.s, p8.s +; CHECK-NEXT: cmphi p8.d, p0/z, z3.d, z4.d +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.h, p4.h, p5.h +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.h, p1.h, p6.h +; CHECK-NEXT: cmphi p6.d, p0/z, z3.d, z24.d ; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z0.d -; CHECK-NEXT: uzp1 p7.s, p7.s, p10.s -; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p8.s, p0.s +; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p3.h, p4.h, p6.h +; CHECK-NEXT: uzp1 p0.s, p6.s, p0.s ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p2.h, p5.h, p2.h -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.h, p7.h, p0.h -; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.b, p1.b, p2.b -; CHECK-NEXT: uzp1 p1.b, p3.b, p4.b +; CHECK-NEXT: uzp1 p3.h, p3.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p2.b, p1.b +; CHECK-NEXT: uzp1 p1.b, p4.b, p3.b ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret %active.lane.mask = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 %index, i64 %TC) @@ -459,12 +462,12 @@ define <4 x i1> @lane_mask_v4i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: adrp x8, .LCPI26_0 ; CHECK-NEXT: movi d2, #0xff00ff00ff00ff ; CHECK-NEXT: ldr d1, [x8, :lo12:.LCPI26_0] +; CHECK-NEXT: dup v3.4h, w1 ; CHECK-NEXT: bic v0.4h, #255, 
lsl #8 +; CHECK-NEXT: bic v3.4h, #255, lsl #8 ; CHECK-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-NEXT: dup v1.4h, w1 ; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: cmhi v0.4h, v1.4h, v0.4h +; CHECK-NEXT: cmhi v0.4h, v3.4h, v0.4h ; CHECK-NEXT: ret %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i8(i8 %index, i8 %TC) ret <4 x i1> %active.lane.mask @@ -480,9 +483,9 @@ define <2 x i1> @lane_mask_v2i1_i8(i8 %index, i8 %TC) { ; CHECK-NEXT: dup v3.2s, w1 ; CHECK-NEXT: and v1.8b, v1.8b, v0.8b ; CHECK-NEXT: add v1.2s, v1.2s, v2.2s -; CHECK-NEXT: umin v1.2s, v1.2s, v0.2s -; CHECK-NEXT: and v0.8b, v3.8b, v0.8b -; CHECK-NEXT: cmhi v0.2s, v0.2s, v1.2s +; CHECK-NEXT: and v2.8b, v3.8b, v0.8b +; CHECK-NEXT: umin v0.2s, v1.2s, v0.2s +; CHECK-NEXT: cmhi v0.2s, v2.2s, v0.2s ; CHECK-NEXT: ret %active.lane.mask = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i8(i8 %index, i8 %TC) ret <2 x i1> %active.lane.mask diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 3007e7ce771e6..508f68d6f14d4 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -54,19 +54,19 @@ define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) { ; CHECK-LABEL: uitofp_v4i64_to_v4bf16: ; CHECK: // %bb.0: ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/arm64-vabs.ll b/llvm/test/CodeGen/AArch64/arm64-vabs.ll index fe4da2e7cf36b..5b45ba2552cef 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vabs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vabs.ll @@ -257,12 +257,12 @@ define i16 @uabd16b_rdx(ptr %a, ptr %b) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.8h v3, v1, v2 ; CHECK-GI-NEXT: usubl2.8h v1, v1, v2 -; CHECK-GI-NEXT: neg.8h v2, v3 -; CHECK-GI-NEXT: neg.8h v4, v1 -; CHECK-GI-NEXT: cmgt.8h v5, v0, v3 +; CHECK-GI-NEXT: cmgt.8h v2, v0, v3 ; CHECK-GI-NEXT: cmgt.8h v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.8h v4, v3 +; CHECK-GI-NEXT: neg.8h v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.8h v0, v2, v0 ; CHECK-GI-NEXT: addv.8h h0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -299,18 +299,18 @@ define i32 @uabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-NEXT: usubl2.4s v3, v3, v4 ; CHECK-GI-NEXT: usubl.4s v4, v0, v1 ; CHECK-GI-NEXT: usubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: neg.4s v6, v5 -; CHECK-GI-NEXT: neg.4s v7, v3 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: neg.4s v17, v0 -; CHECK-GI-NEXT: cmgt.4s v18, v2, v3 -; CHECK-GI-NEXT: cmgt.4s 
v19, v2, v4 +; CHECK-GI-NEXT: cmgt.4s v6, v2, v3 +; CHECK-GI-NEXT: neg.4s v16, v5 +; CHECK-GI-NEXT: cmgt.4s v7, v2, v4 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bsl.16b v1, v6, v5 -; CHECK-GI-NEXT: bit.16b v3, v7, v18 -; CHECK-GI-NEXT: bit.16b v4, v16, v19 -; CHECK-GI-NEXT: bit.16b v0, v17, v2 +; CHECK-GI-NEXT: neg.4s v17, v3 +; CHECK-GI-NEXT: neg.4s v18, v4 +; CHECK-GI-NEXT: neg.4s v19, v0 +; CHECK-GI-NEXT: bsl.16b v1, v16, v5 +; CHECK-GI-NEXT: bit.16b v3, v17, v6 +; CHECK-GI-NEXT: bit.16b v4, v18, v7 +; CHECK-GI-NEXT: bit.16b v0, v19, v2 ; CHECK-GI-NEXT: add.4s v1, v1, v3 ; CHECK-GI-NEXT: add.4s v0, v4, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -347,18 +347,18 @@ define i32 @sabd16b_rdx_i32(<16 x i8> %a, <16 x i8> %b) { ; CHECK-GI-NEXT: ssubl2.4s v3, v3, v4 ; CHECK-GI-NEXT: ssubl.4s v4, v0, v1 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: neg.4s v6, v5 -; CHECK-GI-NEXT: neg.4s v7, v3 ; CHECK-GI-NEXT: cmgt.4s v1, v2, v5 -; CHECK-GI-NEXT: neg.4s v16, v4 -; CHECK-GI-NEXT: neg.4s v17, v0 -; CHECK-GI-NEXT: cmgt.4s v18, v2, v3 -; CHECK-GI-NEXT: cmgt.4s v19, v2, v4 +; CHECK-GI-NEXT: cmgt.4s v6, v2, v3 +; CHECK-GI-NEXT: neg.4s v16, v5 +; CHECK-GI-NEXT: cmgt.4s v7, v2, v4 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bsl.16b v1, v6, v5 -; CHECK-GI-NEXT: bit.16b v3, v7, v18 -; CHECK-GI-NEXT: bit.16b v4, v16, v19 -; CHECK-GI-NEXT: bit.16b v0, v17, v2 +; CHECK-GI-NEXT: neg.4s v17, v3 +; CHECK-GI-NEXT: neg.4s v18, v4 +; CHECK-GI-NEXT: neg.4s v19, v0 +; CHECK-GI-NEXT: bsl.16b v1, v16, v5 +; CHECK-GI-NEXT: bit.16b v3, v17, v6 +; CHECK-GI-NEXT: bit.16b v4, v18, v7 +; CHECK-GI-NEXT: bit.16b v0, v19, v2 ; CHECK-GI-NEXT: add.4s v1, v1, v3 ; CHECK-GI-NEXT: add.4s v0, v4, v0 ; CHECK-GI-NEXT: add.4s v0, v1, v0 @@ -396,12 +396,12 @@ define i32 @uabd8h_rdx(ptr %a, ptr %b) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.4s v3, v1, v2 ; CHECK-GI-NEXT: usubl2.4s v1, v1, v2 -; CHECK-GI-NEXT: neg.4s v2, v3 -; CHECK-GI-NEXT: neg.4s v4, v1 -; CHECK-GI-NEXT: cmgt.4s v5, v0, v3 +; CHECK-GI-NEXT: cmgt.4s v2, v0, v3 ; CHECK-GI-NEXT: cmgt.4s v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.4s v4, v3 +; CHECK-GI-NEXT: neg.4s v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.4s v0, v2, v0 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -428,15 +428,15 @@ define i32 @sabd8h_rdx(<8 x i16> %a, <8 x i16> %b) { ; ; CHECK-GI-LABEL: sabd8h_rdx: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.4s v3, v0, v1 ; CHECK-GI-NEXT: ssubl2.4s v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 -; CHECK-GI-NEXT: neg.4s v1, v3 -; CHECK-GI-NEXT: neg.4s v4, v0 -; CHECK-GI-NEXT: cmgt.4s v5, v2, v3 +; CHECK-GI-NEXT: neg.4s v4, v3 +; CHECK-GI-NEXT: neg.4s v5, v0 +; CHECK-GI-NEXT: cmgt.4s v1, v2, v3 ; CHECK-GI-NEXT: cmgt.4s v2, v2, v0 -; CHECK-GI-NEXT: bif.16b v1, v3, v5 -; CHECK-GI-NEXT: bit.16b v0, v4, v2 +; CHECK-GI-NEXT: bsl.16b v1, v4, v3 +; CHECK-GI-NEXT: bit.16b v0, v5, v2 ; CHECK-GI-NEXT: add.4s v0, v1, v0 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -461,10 +461,10 @@ define i32 @uabdl4s_rdx_i32(<4 x i16> %a, <4 x i16> %b) { ; ; CHECK-GI-LABEL: uabdl4s_rdx_i32: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: usubl.4s v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.4s v1, v2, v0 ; CHECK-GI-NEXT: neg.4s v2, v0 -; 
CHECK-GI-NEXT: cmgt.4s v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: addv.4s s0, v0 ; CHECK-GI-NEXT: fmov w0, s0 @@ -499,12 +499,12 @@ define i64 @uabd4s_rdx(ptr %a, ptr %b, i32 %h) { ; CHECK-GI-NEXT: movi.2d v0, #0000000000000000 ; CHECK-GI-NEXT: usubl.2d v3, v1, v2 ; CHECK-GI-NEXT: usubl2.2d v1, v1, v2 -; CHECK-GI-NEXT: neg.2d v2, v3 -; CHECK-GI-NEXT: neg.2d v4, v1 -; CHECK-GI-NEXT: cmgt.2d v5, v0, v3 +; CHECK-GI-NEXT: cmgt.2d v2, v0, v3 ; CHECK-GI-NEXT: cmgt.2d v0, v0, v1 -; CHECK-GI-NEXT: bif.16b v2, v3, v5 -; CHECK-GI-NEXT: bsl.16b v0, v4, v1 +; CHECK-GI-NEXT: neg.2d v4, v3 +; CHECK-GI-NEXT: neg.2d v5, v1 +; CHECK-GI-NEXT: bsl.16b v2, v4, v3 +; CHECK-GI-NEXT: bsl.16b v0, v5, v1 ; CHECK-GI-NEXT: add.2d v0, v2, v0 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -531,15 +531,15 @@ define i64 @sabd4s_rdx(<4 x i32> %a, <4 x i32> %b) { ; ; CHECK-GI-LABEL: sabd4s_rdx: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.2d v3, v0, v1 ; CHECK-GI-NEXT: ssubl2.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 -; CHECK-GI-NEXT: neg.2d v1, v3 -; CHECK-GI-NEXT: neg.2d v4, v0 -; CHECK-GI-NEXT: cmgt.2d v5, v2, v3 +; CHECK-GI-NEXT: neg.2d v4, v3 +; CHECK-GI-NEXT: neg.2d v5, v0 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v3 ; CHECK-GI-NEXT: cmgt.2d v2, v2, v0 -; CHECK-GI-NEXT: bif.16b v1, v3, v5 -; CHECK-GI-NEXT: bit.16b v0, v4, v2 +; CHECK-GI-NEXT: bsl.16b v1, v4, v3 +; CHECK-GI-NEXT: bit.16b v0, v5, v2 ; CHECK-GI-NEXT: add.2d v0, v1, v0 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -564,10 +564,10 @@ define i64 @uabdl2d_rdx_i64(<2 x i32> %a, <2 x i32> %b) { ; ; CHECK-GI-LABEL: uabdl2d_rdx_i64: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: usubl.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v0 ; CHECK-GI-NEXT: neg.2d v2, v0 -; CHECK-GI-NEXT: cmgt.2d v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: addp.2d d0, v0 ; CHECK-GI-NEXT: fmov x0, d0 @@ -1796,10 +1796,10 @@ define <2 x i64> @uabd_i32(<2 x i32> %a, <2 x i32> %b) { ; ; CHECK-GI-LABEL: uabd_i32: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi.2d v2, #0000000000000000 ; CHECK-GI-NEXT: ssubl.2d v0, v0, v1 -; CHECK-GI-NEXT: movi.2d v1, #0000000000000000 +; CHECK-GI-NEXT: cmgt.2d v1, v2, v0 ; CHECK-GI-NEXT: neg.2d v2, v0 -; CHECK-GI-NEXT: cmgt.2d v1, v1, v0 ; CHECK-GI-NEXT: bit.16b v0, v2, v1 ; CHECK-GI-NEXT: ret %aext = sext <2 x i32> %a to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index cafee32ada686..d4cc154ac6afc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -205,15 +205,15 @@ define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; GENERIC-LABEL: test_vcvt_bf16_f64: ; GENERIC: // %bb.0: ; GENERIC-NEXT: fcvtxn v0.2s, v0.2d -; GENERIC-NEXT: movi.4s v1, #127, msl #8 -; GENERIC-NEXT: movi.4s v2, #1 +; GENERIC-NEXT: movi.4s v1, #1 +; GENERIC-NEXT: movi.4s v2, #127, msl #8 ; GENERIC-NEXT: ushr.4s v3, v0, #16 -; GENERIC-NEXT: add.4s v1, v0, v1 -; GENERIC-NEXT: and.16b v2, v3, v2 -; GENERIC-NEXT: add.4s v1, v2, v1 -; GENERIC-NEXT: fcmeq.4s v2, v0, v0 +; GENERIC-NEXT: add.4s v2, v0, v2 +; GENERIC-NEXT: and.16b v1, v3, v1 +; GENERIC-NEXT: fcmeq.4s v3, v0, v0 ; GENERIC-NEXT: orr.4s v0, #64, lsl #16 -; GENERIC-NEXT: bit.16b v0, v1, v2 +; GENERIC-NEXT: add.4s v1, v1, v2 +; GENERIC-NEXT: bit.16b v0, v1, v3 ; 
GENERIC-NEXT: shrn.4h v0, v0, #16 ; GENERIC-NEXT: ret ; @@ -238,15 +238,15 @@ define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp { ; GISEL-LABEL: test_vcvt_bf16_f64: ; GISEL: // %bb.0: ; GISEL-NEXT: fcvtxn v0.2s, v0.2d -; GISEL-NEXT: movi.4s v1, #127, msl #8 -; GISEL-NEXT: movi.4s v2, #1 +; GISEL-NEXT: movi.4s v1, #1 +; GISEL-NEXT: movi.4s v2, #127, msl #8 ; GISEL-NEXT: ushr.4s v3, v0, #16 -; GISEL-NEXT: add.4s v1, v0, v1 -; GISEL-NEXT: and.16b v2, v3, v2 -; GISEL-NEXT: add.4s v1, v2, v1 -; GISEL-NEXT: fcmeq.4s v2, v0, v0 +; GISEL-NEXT: add.4s v2, v0, v2 +; GISEL-NEXT: and.16b v1, v3, v1 +; GISEL-NEXT: fcmeq.4s v3, v0, v0 ; GISEL-NEXT: orr.4s v0, #64, lsl #16 -; GISEL-NEXT: bit.16b v0, v1, v2 +; GISEL-NEXT: add.4s v1, v1, v2 +; GISEL-NEXT: bit.16b v0, v1, v3 ; GISEL-NEXT: shrn.4h v0, v0, #16 ; GISEL-NEXT: ret %vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll index dda610e5dd3cb..e754f01daa2a9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vhadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vhadd.ll @@ -903,10 +903,10 @@ define <2 x i16> @hadd8x2_sext_lsr(<2 x i8> %src1, <2 x i8> %src2) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl.2s v0, v0, #24 ; CHECK-NEXT: shl.2s v1, v1, #24 +; CHECK-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-NEXT: sshr.2s v0, v0, #24 ; CHECK-NEXT: ssra.2s v0, v1, #24 -; CHECK-NEXT: movi d1, #0x00ffff0000ffff -; CHECK-NEXT: and.8b v0, v0, v1 +; CHECK-NEXT: and.8b v0, v0, v2 ; CHECK-NEXT: ushr.2s v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <2 x i8> %src1 to <2 x i16> @@ -968,10 +968,10 @@ define <4 x i16> @rhadd8_sext_lsr(<4 x i8> %src1, <4 x i8> %src2) { ; CHECK: // %bb.0: ; CHECK-NEXT: shl.4h v0, v0, #8 ; CHECK-NEXT: shl.4h v1, v1, #8 +; CHECK-NEXT: movi.4h v2, #1 ; CHECK-NEXT: sshr.4h v0, v0, #8 ; CHECK-NEXT: ssra.4h v0, v1, #8 -; CHECK-NEXT: movi.4h v1, #1 -; CHECK-NEXT: add.4h v0, v0, v1 +; CHECK-NEXT: add.4h v0, v0, v2 ; CHECK-NEXT: ushr.4h v0, v0, #1 ; CHECK-NEXT: ret %zextsrc1 = sext <4 x i8> %src1 to <4 x i16> diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll index ebf5ce20d4ecc..86b1d5d195ffd 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-contract.ll @@ -7,21 +7,22 @@ target triple = "aarch64-unknown-linux-gnu" define @mull_add( %a, %b, %c) { ; CHECK-LABEL: mull_add: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z6.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z7.d, z2.d, z3.d -; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z6.d, z0.d, z1.d +; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z2.d, z3.d +; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d -; CHECK-NEXT: fmul z2.d, z6.d, z7.d -; CHECK-NEXT: fmul z3.d, z0.d, z7.d -; CHECK-NEXT: fmad z0.d, p0/m, z1.d, z2.d -; CHECK-NEXT: fnmsb z1.d, p0/m, z6.d, z3.d -; CHECK-NEXT: uzp2 z2.d, z4.d, z5.d -; CHECK-NEXT: uzp1 z3.d, z4.d, z5.d -; CHECK-NEXT: fadd z2.d, z0.d, z2.d +; CHECK-NEXT: fmul z7.d, z0.d, z1.d +; CHECK-NEXT: fmul z1.d, z6.d, z1.d +; CHECK-NEXT: movprfx z3, z7 +; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z2.d +; CHECK-NEXT: fnmsb z0.d, p0/m, z2.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d +; CHECK-NEXT: uzp1 z2.d, z4.d, z5.d +; CHECK-NEXT: fadd z2.d, z2.d, z0.d ; CHECK-NEXT: fadd z1.d, z3.d, z1.d -; 
CHECK-NEXT: zip1 z0.d, z1.d, z2.d -; CHECK-NEXT: zip2 z1.d, z1.d, z2.d +; CHECK-NEXT: zip1 z0.d, z2.d, z1.d +; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -49,21 +50,21 @@ entry: define @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -100,21 +101,21 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z4.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fsub z0.d, z25.d, z27.d ; CHECK-NEXT: fsub z1.d, z26.d, z24.d +; CHECK-NEXT: fsub z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -151,21 +152,21 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 ; CHECK-NEXT: fcmla z25.d, p0/m, z2.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z3.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z4.d, z6.d, #270 -; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 -; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: fadd z1.d, z26.d, z24.d +; CHECK-NEXT: fadd z0.d, z25.d, z27.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -206,8 +207,8 @@ define @mul_add_rot_mull( %a, @mul_add_rot_mull( %a, @mul_add_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; 
CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -90,19 +90,19 @@ entry: define @mul_sub_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_sub_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -139,19 +139,19 @@ entry: define @mul_conj_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_conj_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90 -; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 +; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90 ; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0 -; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270 -; CHECK-NEXT: mov z0.d, z25.d +; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270 ; CHECK-NEXT: mov z1.d, z24.d +; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: ret entry: %strided.vec = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -188,24 +188,25 @@ entry: define @mul_add_rot_mull( %a, %b, %c, %d) { ; CHECK-LABEL: mul_add_rot_mull: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uzp1 z24.d, z2.d, z3.d +; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d ; CHECK-NEXT: uzp2 z25.d, z0.d, z1.d -; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d ; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d +; CHECK-NEXT: uzp2 z1.d, z4.d, z5.d ; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d -; CHECK-NEXT: fmul z1.d, z24.d, z25.d -; CHECK-NEXT: fmul z3.d, z2.d, z25.d -; CHECK-NEXT: uzp2 z25.d, z4.d, z5.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uzp1 z4.d, 
z4.d, z5.d ; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d -; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z25.d -; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z0.d -; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d -; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d -; CHECK-NEXT: fnmls z2.d, p0/m, z24.d, z0.d -; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z25.d +; CHECK-NEXT: fmul z3.d, z2.d, z25.d +; CHECK-NEXT: fmul z25.d, z24.d, z25.d +; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z0.d +; CHECK-NEXT: movprfx z24, z25 +; CHECK-NEXT: fmla z24.d, p0/m, z26.d, z1.d +; CHECK-NEXT: movprfx z6, z24 +; CHECK-NEXT: fmla z6.d, p0/m, z5.d, z4.d +; CHECK-NEXT: fmla z3.d, p0/m, z26.d, z4.d +; CHECK-NEXT: fnmsb z2.d, p0/m, z0.d, z6.d +; CHECK-NEXT: fmsb z1.d, p0/m, z5.d, z3.d ; CHECK-NEXT: zip1 z0.d, z2.d, z1.d ; CHECK-NEXT: zip2 z1.d, z2.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll index 611cf44ea7ee8..cb285c05b2e86 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll @@ -16,9 +16,10 @@ define @complex_mul_v4f16( %a, @complex_mul_v8f16( %a, %b) { ; CHECK-LABEL: complex_mul_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0 ; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -72,15 +73,15 @@ entry: define @complex_mul_v16f16( %a, %b) { ; CHECK-LABEL: complex_mul_v16f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z4.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0 -; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0 ; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f16( %a) @@ -103,23 +104,23 @@ entry: define @complex_mul_v32f16( %a, %b) { ; CHECK-LABEL: complex_mul_v32f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z24.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 ; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #0 ; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #0 ; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #0 -; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0 +; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 ; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #90 ; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #90 ; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #90 -; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32f16( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll index 0f5e9a2202ddd..1e2afb78de1b0 100644 --- 
a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll @@ -7,8 +7,8 @@ target triple = "aarch64" define @complex_mul_v4f32( %a, %b) { ; CHECK-LABEL: complex_mul_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0 ; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -34,15 +34,15 @@ entry: define @complex_mul_v8f32( %a, %b) { ; CHECK-LABEL: complex_mul_v8f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z4.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0 -; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0 ; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %a) @@ -65,23 +65,23 @@ entry: define @complex_mul_v16f32( %a, %b) { ; CHECK-LABEL: complex_mul_v16f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z24.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 ; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #0 ; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #0 ; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #0 -; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0 +; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 ; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #90 ; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #90 ; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #90 -; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16f32( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll index 1fe554bdc616e..17a239a09a033 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll @@ -7,8 +7,8 @@ target triple = "aarch64" define @complex_mul_v2f64( %a, %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #0 ; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #90 ; CHECK-NEXT: mov z0.d, z2.d @@ -34,15 +34,15 @@ entry: define @complex_mul_v4f64( %a, %b) { ; CHECK-LABEL: complex_mul_v4f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, z4.d -; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #0 -; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #90 +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0 ; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #90 -; CHECK-NEXT: mov z0.d, z5.d +; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, 
#90 ; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: mov z0.d, z5.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %a) @@ -65,23 +65,23 @@ entry: define @complex_mul_v8f64( %a, %b) { ; CHECK-LABEL: complex_mul_v8f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z24.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, z24.d ; CHECK-NEXT: mov z26.d, z24.d ; CHECK-NEXT: mov z27.d, z24.d +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 ; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #0 ; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #0 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #0 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0 +; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 ; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #90 ; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #90 ; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #90 -; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90 +; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: mov z0.d, z25.d ; CHECK-NEXT: mov z1.d, z26.d ; CHECK-NEXT: mov z2.d, z27.d -; CHECK-NEXT: mov z3.d, z24.d ; CHECK-NEXT: ret entry: %a.deinterleaved = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %a) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll index 1b8a21b66ade9..07488b623b98d 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-i16-mul-scalable.ll @@ -16,8 +16,9 @@ define @complex_mul_v4i16( %a, This Inner Loop Header: Depth=1 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d ; CHECK-NEXT: add x13, x0, x8 ; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: whilelo p1.d, x12, x9 +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] ; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: add x12, x12, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] +; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: whilelo p1.d, x12, x9 +; CHECK-NEXT: add x12, x12, x10 ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -114,10 +114,10 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-LABEL: complex_mul_predicated_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x10 -; CHECK-NEXT: neg x11, x10 ; CHECK-NEXT: mov w12, #100 // =0x64 +; CHECK-NEXT: neg x11, x10 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: and x11, x11, x12 @@ -133,20 +133,20 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x9, x9, x10 ; 
CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 +; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmp x11, x9 -; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: zip1 p1.d, p1.d, p1.d -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14] +; CHECK-NEXT: zip2 p1.d, p2.d, p2.d +; CHECK-NEXT: zip1 p2.d, p2.d, p2.d +; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] +; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p2/m, z7.d -; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -217,8 +217,8 @@ exit.block: ; preds = %vector.body define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) { ; CHECK-LABEL: complex_mul_predicated_x2_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: whilelo p1.d, xzr, x10 ; CHECK-NEXT: mov x8, xzr @@ -236,19 +236,19 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-NEXT: mov z7.d, z0.d ; CHECK-NEXT: add x9, x9, x11 ; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p3.d, p1.d, p1.d -; CHECK-NEXT: zip1 p2.d, p1.d, p1.d -; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl] +; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0 +; CHECK-NEXT: zip2 p1.d, p2.d, p2.d +; CHECK-NEXT: zip1 p2.d, p2.d, p2.d +; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] ; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p3/m, z7.d +; CHECK-NEXT: mov z0.d, p1/m, z7.d +; CHECK-NEXT: whilelo p1.d, x9, x10 ; CHECK-NEXT: mov z1.d, p2/m, z6.d ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index 1696ac8709d40..664d99a3627b5 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -15,11 +15,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: mov w10, #100 // =0x64 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: rdvl x11, #2 @@ -101,18 +101,18 @@ exit.block: ; preds = 
%vector.body define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_nonzero_init_v2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov d0, #1.00000000 ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: fmov d2, #2.00000000 ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: mov w10, #100 // =0x64 -; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: fmov d2, #2.00000000 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: neg x9, x9 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: mov w10, #100 // =0x64 ; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: rdvl x11, #2 -; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d ; CHECK-NEXT: mov z1.d, p0/m, z2.d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: zip2 z0.d, z1.d, z3.d @@ -190,12 +190,12 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cntw x9 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: mov w10, #1000 // =0x3e8 +; CHECK-NEXT: neg x9, x9 ; CHECK-NEXT: rdvl x12, #2 +; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x9, x10 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d @@ -324,10 +324,10 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-LABEL: reduction_mix: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z2.d, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x9 -; CHECK-NEXT: neg x10, x9 ; CHECK-NEXT: mov w11, #100 // =0x64 +; CHECK-NEXT: neg x10, x9 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: and x10, x10, x11 ; CHECK-NEXT: rdvl x11, #2 @@ -349,8 +349,8 @@ define dso_local %"class.std::complex" @reduction_mix(ptr %a, ptr %b, ptr noalia ; CHECK-NEXT: uaddv d2, p0, z2.d ; CHECK-NEXT: uzp2 z3.d, z1.d, z0.d ; CHECK-NEXT: uzp1 z1.d, z1.d, z0.d -; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d0, p0, z3.d +; CHECK-NEXT: fmov x8, d2 ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index 742a7099559f7..17bf5ba6eb48b 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -8,8 +8,8 @@ target triple = "aarch64" define @complex_mul_const( %a, %b) { ; CHECK-LABEL: complex_mul_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z7.d, #3.00000000 ; CHECK-NEXT: fmov z24.d, #11.00000000 ; CHECK-NEXT: mov z6.d, z4.d @@ -55,25 +55,25 @@ entry: define @complex_mul_non_const( %a, %b, [2 x double] %c) { ; CHECK-LABEL: complex_mul_non_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z6.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d5 killed $d5 def $z5 ; CHECK-NEXT: // kill: def $d4 killed $d4 def $z4 ; CHECK-NEXT: mov z5.d, d5 ; CHECK-NEXT: mov z4.d, d4 ; CHECK-NEXT: mov z24.d, z6.d ; CHECK-NEXT: mov z7.d, z6.d -; CHECK-NEXT: zip2 z25.d, z4.d, z5.d -; CHECK-NEXT: zip1 z4.d, z4.d, z5.d ; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z0.d, z2.d, #0 ; CHECK-NEXT: fcmla z24.d, p0/m, 
z1.d, z3.d, #90 +; CHECK-NEXT: zip2 z1.d, z4.d, z5.d ; CHECK-NEXT: fcmla z7.d, p0/m, z0.d, z2.d, #90 +; CHECK-NEXT: zip1 z2.d, z4.d, z5.d ; CHECK-NEXT: mov z0.d, z6.d -; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z25.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 -; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z25.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z1.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z2.d, #0 +; CHECK-NEXT: fcmla z6.d, p0/m, z24.d, z1.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z2.d, #90 ; CHECK-NEXT: mov z1.d, z6.d ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll index 1c254f9ed935d..e6d5a2ac0fd79 100644 --- a/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll +++ b/llvm/test/CodeGen/AArch64/concat_vector-truncate-combine.ll @@ -96,8 +96,8 @@ entry: define void @test_concat_fptrunc_v4f64_to_v4f32(ptr %ptr) #1 { ; CHECK-LABEL: test_concat_fptrunc_v4f64_to_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll index 4a2e85c715f7a..83c7f73800af1 100644 --- a/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll +++ b/llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll @@ -9,8 +9,8 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-LABEL: allocno_reload_assign: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z0.b, #0 // =0x0 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z16.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: uunpkhi z0.h, z0.b @@ -48,12 +48,12 @@ define fastcc i8 @allocno_reload_assign() { ; CHECK-NEXT: punpklo p4.h, p3.b ; CHECK-NEXT: punpkhi p3.h, p3.b ; CHECK-NEXT: st1b { z2.d }, p4, [z16.d] -; CHECK-NEXT: punpklo p4.h, p2.b -; CHECK-NEXT: punpkhi p2.h, p2.b ; CHECK-NEXT: st1b { z3.d }, p3, [z16.d] -; CHECK-NEXT: punpklo p3.h, p4.b -; CHECK-NEXT: st1b { z4.d }, p3, [z16.d] -; CHECK-NEXT: punpkhi p3.h, p4.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: st1b { z4.d }, p4, [z16.d] ; CHECK-NEXT: st1b { z5.d }, p3, [z16.d] ; CHECK-NEXT: punpklo p3.h, p2.b ; CHECK-NEXT: punpkhi p2.h, p2.b diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll index 49ad3ae7d6290..6e13ae6feb66b 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-signed.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; ALL-NEXT: sdiv x9, x9, x8 ; ALL-NEXT: sdiv x11, x11, x10 ; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: mov v1.d[1], x10 ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = sdiv <2 x i64> %x, %y store <2 x i64> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll 
b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll index 3bc50b2f03d83..650219e03b8a7 100644 --- a/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/AArch64/div-rem-pair-recomposition-unsigned.ll @@ -228,13 +228,13 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw ; ALL-NEXT: udiv x9, x9, x8 ; ALL-NEXT: udiv x11, x11, x10 ; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: fmov d2, x9 ; ALL-NEXT: fmov d1, x8 ; ALL-NEXT: mul x10, x11, x10 +; ALL-NEXT: mov v2.d[1], x11 +; ALL-NEXT: str q2, [x0] ; ALL-NEXT: mov v1.d[1], x10 ; ALL-NEXT: sub v0.2d, v0.2d, v1.2d -; ALL-NEXT: fmov d1, x9 -; ALL-NEXT: mov v1.d[1], x11 -; ALL-NEXT: str q1, [x0] ; ALL-NEXT: ret %div = udiv <2 x i64> %x, %y store <2 x i64> %div, ptr %divdst, align 16 diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index dff4831330deb..bd9d9b99622e3 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -502,9 +502,9 @@ define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b ; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4 ; CHECK-NEXT: ld1 { v7.s }[1], [x7] -; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b ; CHECK-NEXT: shll v0.4s, v4.4h, #16 ; CHECK-NEXT: shll2 v4.4s, v4.8h, #16 +; CHECK-NEXT: usubl v5.8h, v6.8b, v7.8b ; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: saddw2 v1.4s, v4.4s, v1.8h ; CHECK-NEXT: shll v6.4s, v5.4h, #16 @@ -647,10 +647,10 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: add x8, x3, #8 -; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: str s1, [x4] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ldp s0, s4, [x2] +; CHECK-NEXT: ldr s0, [x2] ; CHECK-NEXT: ushll v2.8h, v0.8b, #0 ; CHECK-NEXT: umov w9, v2.h[0] ; CHECK-NEXT: umov w10, v2.h[1] @@ -664,31 +664,32 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x9, x1, #4 ; CHECK-NEXT: mov v1.d[1], v2.d[0] ; CHECK-NEXT: mov v0.b[11], w10 -; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x10, x3, #12 ; CHECK-NEXT: bic v1.8h, #255, lsl #8 ; CHECK-NEXT: ld1 { v0.s }[3], [x3], #4 -; CHECK-NEXT: ldr s3, [x0, #12] -; CHECK-NEXT: ldp s2, s7, [x0, #4] -; CHECK-NEXT: ld1 { v4.s }[1], [x3] -; CHECK-NEXT: ldp s5, s6, [x2, #8] -; CHECK-NEXT: ld1 { v3.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-NEXT: ld1 { v6.s }[1], [x11] +; CHECK-NEXT: ldr s5, [x0, #4] +; CHECK-NEXT: ldp s2, s3, [x2, #4] +; CHECK-NEXT: ldr s7, [x2, #12] +; CHECK-NEXT: ldp s6, s4, [x0, #8] +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: ld1 { v7.s }[1], [x10] +; CHECK-NEXT: ld1 { v3.s }[1], [x8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3] ; CHECK-NEXT: add x8, x1, #8 -; CHECK-NEXT: ld1 { v7.s }[1], [x8] -; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b -; CHECK-NEXT: ushll v3.8h, v5.8b, #0 -; CHECK-NEXT: uaddl v4.8h, v4.8b, v6.8b -; CHECK-NEXT: uaddw v1.8h, v1.8h, v7.8b +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: uaddl v2.8h, v2.8b, v7.8b +; CHECK-NEXT: uaddl v4.8h, v5.8b, v4.8b +; CHECK-NEXT: uaddw v1.8h, v1.8h, v6.8b ; CHECK-NEXT: uaddw2 v5.8h, v3.8h, v0.16b -; CHECK-NEXT: ushll v0.4s, v2.4h, #3 +; CHECK-NEXT: ushll v6.4s, v2.4h, #3 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #3 -; CHECK-NEXT: ushll 
v6.4s, v4.4h, #3 +; CHECK-NEXT: ushll v0.4s, v4.4h, #3 ; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v1.8h -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v5.8h +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v1.8h +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v5.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v5.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p @@ -762,35 +763,35 @@ define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: add x8, x1, #8 ; CHECK-NEXT: ldr s6, [x1, #12] ; CHECK-NEXT: ldp s17, s18, [x2, #8] -; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ldp s3, s5, [x2] ; CHECK-NEXT: add x9, x3, #8 ; CHECK-NEXT: mov v4.16b, v1.16b ; CHECK-NEXT: ldp s7, s16, [x0] -; CHECK-NEXT: ldr s5, [x3, #12] +; CHECK-NEXT: ldr s2, [x3, #12] ; CHECK-NEXT: mov v1.s[1], v6.s[0] -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v3.s }[1], [x3], #4 ; CHECK-NEXT: mov v4.s[1], v6.s[0] ; CHECK-NEXT: ld1 { v7.s }[1], [x1], #4 ; CHECK-NEXT: ld1 { v16.s }[1], [x1] -; CHECK-NEXT: ld1 { v3.s }[1], [x3] -; CHECK-NEXT: ld1 { v0.s }[1], [x8] +; CHECK-NEXT: ld1 { v5.s }[1], [x3] ; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v0.s }[1], [x8] ; CHECK-NEXT: mov v4.s[2], v18.s[0] -; CHECK-NEXT: mov v18.s[1], v5.s[0] +; CHECK-NEXT: mov v18.s[1], v2.s[0] ; CHECK-NEXT: uaddl v1.8h, v16.8b, v1.8b ; CHECK-NEXT: uaddl v6.8h, v7.8b, v0.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v17.8b -; CHECK-NEXT: uaddl v3.8h, v3.8b, v18.8b +; CHECK-NEXT: uaddl v7.8h, v3.8b, v17.8b ; CHECK-NEXT: ushll v0.4s, v1.4h, #3 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #3 -; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: uaddl v5.8h, v5.8b, v18.8b +; CHECK-NEXT: mov v4.s[3], v2.s[0] ; CHECK-NEXT: uaddw v0.4s, v0.4s, v6.4h ; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v6.8h -; CHECK-NEXT: ushll v7.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 +; CHECK-NEXT: ushll v16.4s, v5.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v5.8h, #3 ; CHECK-NEXT: str q4, [x4] -; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h -; CHECK-NEXT: uaddw v2.4s, v7.4s, v2.4h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v7.8h +; CHECK-NEXT: uaddw v2.4s, v16.4s, v7.4h ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -873,8 +874,8 @@ define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v4.s }[1], [x11] ; CHECK-NEXT: ld1 { v2.s }[1], [x3] ; CHECK-NEXT: ld1 { v0.s }[1], [x10] -; CHECK-NEXT: ld1 { v7.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] ; CHECK-NEXT: uaddl v5.8h, v5.8b, v4.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v0.8b ; CHECK-NEXT: ushll v16.8h, v0.8b, #0 @@ -972,8 +973,8 @@ define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: uaddl v5.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b ; CHECK-NEXT: uaddl v1.8h, v0.8b, v4.8b @@ -1072,23 +1073,23 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-NEXT: ld1 { v6.s }[1], [x9] ; CHECK-NEXT: ld1 { v4.s }[1], [x8] ; CHECK-NEXT: uaddl v7.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v3.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b ; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b ; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b ; CHECK-NEXT: ushll v0.4s, v7.4h, #3 -; CHECK-NEXT: ushll2 v1.4s, 
v7.8h, #3 -; CHECK-NEXT: ushll v5.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v6.4s, v3.8h, #3 -; CHECK-NEXT: ushll2 v16.4s, v3.8h, #0 -; CHECK-NEXT: ushll v17.4s, v3.4h, #0 -; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h +; CHECK-NEXT: ushll2 v3.4s, v7.8h, #3 +; CHECK-NEXT: ushll v6.4s, v1.4h, #3 +; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v5.4s, v1.8h, #0 +; CHECK-NEXT: ushll v17.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v18.4s, v7.8h, #0 +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v2.8h ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h -; CHECK-NEXT: uaddw2 v3.4s, v6.4s, v4.8h -; CHECK-NEXT: ushll2 v4.4s, v7.8h, #0 -; CHECK-NEXT: ushll v5.4s, v7.4h, #0 -; CHECK-NEXT: stp q17, q16, [x4, #32] -; CHECK-NEXT: stp q5, q4, [x4] +; CHECK-NEXT: uaddw v2.4s, v6.4s, v4.4h +; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v4.8h +; CHECK-NEXT: ushll v4.4s, v7.4h, #0 +; CHECK-NEXT: stp q17, q5, [x4, #32] +; CHECK-NEXT: stp q4, q18, [x4] ; CHECK-NEXT: ret %lp1 = load <4 x i8>, ptr %p %p2 = getelementptr i8, ptr %p, i32 4 @@ -1157,32 +1158,32 @@ define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { ; CHECK-LABEL: extrause_shl: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldp s1, s2, [x0] ; CHECK-NEXT: add x10, x3, #12 -; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ldp s0, s3, [x2] ; CHECK-NEXT: add x11, x1, #12 ; CHECK-NEXT: ldp s4, s5, [x0, #8] ; CHECK-NEXT: add x8, x3, #8 ; CHECK-NEXT: ldp s6, s7, [x2, #8] ; CHECK-NEXT: add x9, x1, #8 -; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 -; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 -; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1] ; CHECK-NEXT: ld1 { v5.s }[1], [x11] ; CHECK-NEXT: ld1 { v3.s }[1], [x3] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] ; CHECK-NEXT: ld1 { v4.s }[1], [x9] ; CHECK-NEXT: ld1 { v6.s }[1], [x8] -; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v5.8b ; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b -; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b -; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b -; CHECK-NEXT: ushll v5.4s, v1.4h, #3 +; CHECK-NEXT: uaddl v4.8h, v1.8b, v4.8b +; CHECK-NEXT: ushll v5.4s, v2.4h, #3 +; CHECK-NEXT: ushll2 v7.4s, v2.8h, #3 +; CHECK-NEXT: uaddl v2.8h, v0.8b, v6.8b ; CHECK-NEXT: ushll v6.4s, v3.4h, #3 -; CHECK-NEXT: ushll2 v7.4s, v1.8h, #3 ; CHECK-NEXT: ushll2 v16.4s, v3.8h, #3 -; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: uaddw2 v1.4s, v7.4s, v4.8h +; CHECK-NEXT: uaddw v0.4s, v5.4s, v4.4h ; CHECK-NEXT: stp q5, q7, [x4] ; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v2.8h ; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 9916aeeab1cad..b1ca88975a621 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -280,10 +280,10 @@ define <3 x i32> @v3f64_i32(<3 x double> %a, <3 x double> %b, <3 x i32> %d, <3 x ; CHECK-GI-NEXT: uzp1 v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: fmov s2, w8 ; CHECK-GI-NEXT: mov v2.s[1], w8 -; CHECK-GI-NEXT: neg v3.4s, v1.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s ; CHECK-GI-NEXT: mov v2.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v2.16b ; CHECK-GI-NEXT: and v0.16b, v6.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v7.16b, v1.16b @@ -348,10 +348,10 @@ define <3 x 
float> @v3f32_float(<3 x float> %a, <3 x float> %b, <3 x float> %d, ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b @@ -426,10 +426,10 @@ define <3 x i32> @v3f32_i32(<3 x float> %a, <3 x float> %b, <3 x i32> %d, <3 x i ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b @@ -545,8 +545,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-LABEL: v7f16_half: ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: mov w8, #15 // =0xf -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[4] -; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] ; CHECK-GI-NOFP16-NEXT: fmov s4, w8 ; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[4] ; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff @@ -555,32 +555,32 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h -; CHECK-GI-NOFP16-NEXT: mov v5.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: mov v6.h[1], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fmov s7, w8 +; CHECK-GI-NOFP16-NEXT: mov v7.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v6.h[0] +; CHECK-GI-NOFP16-NEXT: fmov s6, w8 ; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v17.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.16b, v7.16b +; CHECK-GI-NOFP16-NEXT: mov v7.h[1], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.16b, v6.16b ; CHECK-GI-NOFP16-NEXT: fcmgt v0.4s, v1.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v18.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v18.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[1], v6.h[0] ; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v19.h[0] -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v4.h[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v6.4s, v6.4h -; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[2], v4.h[0] +; CHECK-GI-NOFP16-NEXT: fcvtl v5.4s, v5.4h +; CHECK-GI-NOFP16-NEXT: mov v17.h[2], v6.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v16.4s, v16.4h -; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v7.h[0] -; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v6.4s -; CHECK-GI-NOFP16-NEXT: mov v5.h[4], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[3], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[3], v6.h[0] +; CHECK-GI-NOFP16-NEXT: fcmgt v1.4s, v16.4s, v5.4s +; CHECK-GI-NOFP16-NEXT: mov v7.h[4], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[4], v6.h[0] ; CHECK-GI-NOFP16-NEXT: uzp1 v0.8h, v0.8h, v1.8h -; CHECK-GI-NOFP16-NEXT: mov v5.h[5], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v7.h[0] 
-; CHECK-GI-NOFP16-NEXT: mov v5.h[6], v4.h[0] -; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v7.h[0] -; CHECK-GI-NOFP16-NEXT: neg v1.8h, v5.8h -; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-NOFP16-NEXT: mov v7.h[5], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[5], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v7.h[6], v4.h[0] +; CHECK-GI-NOFP16-NEXT: mov v17.h[6], v6.h[0] +; CHECK-GI-NOFP16-NEXT: ushl v0.8h, v0.8h, v7.8h +; CHECK-GI-NOFP16-NEXT: neg v1.8h, v7.8h ; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v17.16b ; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b @@ -609,8 +609,8 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; CHECK-GI-FP16-NEXT: mov v7.h[5], v6.h[0] ; CHECK-GI-FP16-NEXT: mov v5.h[6], v4.h[0] ; CHECK-GI-FP16-NEXT: mov v7.h[6], v6.h[0] -; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-FP16-NEXT: ushl v0.8h, v0.8h, v5.8h +; CHECK-GI-FP16-NEXT: neg v1.8h, v5.8h ; CHECK-GI-FP16-NEXT: sshl v0.8h, v0.8h, v1.8h ; CHECK-GI-FP16-NEXT: eor v1.16b, v0.16b, v7.16b ; CHECK-GI-FP16-NEXT: and v0.16b, v2.16b, v0.16b @@ -1047,6 +1047,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6] ; CHECK-GI-NOFP16-NEXT: fmov s16, w0 +; CHECK-GI-NOFP16-NEXT: fmov s18, w4 ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v3.h[0] @@ -1054,6 +1055,7 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v5.h[0] ; CHECK-GI-NOFP16-NEXT: ldr s5, [sp] ; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 +; CHECK-GI-NOFP16-NEXT: mov v18.s[1], w5 ; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-NOFP16-NEXT: fmov w9, s5 ; CHECK-GI-NOFP16-NEXT: fmov s5, w7 @@ -1069,27 +1071,25 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w8 ; CHECK-GI-NOFP16-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NOFP16-NEXT: mov v7.s[1], v17.s[0] -; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h +; CHECK-GI-NOFP16-NEXT: mov v18.s[2], w6 ; CHECK-GI-NOFP16-NEXT: fcvtl v4.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 ; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-NOFP16-NEXT: neg v18.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: mov v7.s[2], v17.s[0] ; CHECK-GI-NOFP16-NEXT: fcmgt v2.4s, v4.4s, v2.4s ; CHECK-GI-NOFP16-NEXT: fmov s4, w8 ; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w8 ; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v2.4s, v3.4s -; CHECK-GI-NOFP16-NEXT: fmov s3, w4 -; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w5 +; CHECK-GI-NOFP16-NEXT: neg v3.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w8 -; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v18.4s +; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v3.4s ; CHECK-GI-NOFP16-NEXT: fmov w8, s6 -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w6 -; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b ; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8 +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v2.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: and v2.16b, v18.16b, v2.16b ; CHECK-GI-NOFP16-NEXT: and v1.16b, v7.16b, v1.16b -; CHECK-GI-NOFP16-NEXT: and v2.16b, v3.16b, v2.16b ; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b ; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] @@ -1111,30 +1111,32 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 
x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: fcmgt v0.8h, v1.8h, v0.8h ; CHECK-GI-FP16-NEXT: mov w10, #31 // =0x1f ; CHECK-GI-FP16-NEXT: ldr s3, [sp] -; CHECK-GI-FP16-NEXT: fmov s1, w10 +; CHECK-GI-FP16-NEXT: fmov s2, w10 ; CHECK-GI-FP16-NEXT: fmov s6, w0 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #8] +; CHECK-GI-FP16-NEXT: fmov s17, w4 ; CHECK-GI-FP16-NEXT: ldr s7, [sp, #24] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #32] ; CHECK-GI-FP16-NEXT: umov w8, v0.h[4] ; CHECK-GI-FP16-NEXT: umov w9, v0.h[5] -; CHECK-GI-FP16-NEXT: mov v1.s[1], w10 +; CHECK-GI-FP16-NEXT: mov v2.s[1], w10 ; CHECK-GI-FP16-NEXT: mov v6.s[1], w1 +; CHECK-GI-FP16-NEXT: mov v17.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v7.s[1], v16.s[0] ; CHECK-GI-FP16-NEXT: ldr s16, [sp, #40] -; CHECK-GI-FP16-NEXT: fmov s2, w8 +; CHECK-GI-FP16-NEXT: fmov s1, w8 ; CHECK-GI-FP16-NEXT: umov w8, v0.h[6] -; CHECK-GI-FP16-NEXT: mov v1.s[2], w10 +; CHECK-GI-FP16-NEXT: mov v2.s[2], w10 ; CHECK-GI-FP16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-GI-FP16-NEXT: mov v6.s[2], w2 +; CHECK-GI-FP16-NEXT: mov v17.s[2], w6 ; CHECK-GI-FP16-NEXT: mov v7.s[2], v16.s[0] -; CHECK-GI-FP16-NEXT: mov v2.s[1], w9 +; CHECK-GI-FP16-NEXT: mov v1.s[1], w9 ; CHECK-GI-FP16-NEXT: mov w9, #-1 // =0xffffffff ; CHECK-GI-FP16-NEXT: fmov s5, w9 -; CHECK-GI-FP16-NEXT: neg v17.4s, v1.4s ; CHECK-GI-FP16-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-GI-FP16-NEXT: mov v6.s[3], w3 -; CHECK-GI-FP16-NEXT: mov v2.s[2], w8 +; CHECK-GI-FP16-NEXT: mov v1.s[2], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s3 ; CHECK-GI-FP16-NEXT: fmov s3, w7 ; CHECK-GI-FP16-NEXT: mov v5.s[1], w9 @@ -1142,26 +1144,24 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; CHECK-GI-FP16-NEXT: mov v3.s[1], w8 ; CHECK-GI-FP16-NEXT: fmov w8, s4 ; CHECK-GI-FP16-NEXT: ldr s4, [sp, #16] -; CHECK-GI-FP16-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-GI-FP16-NEXT: fmov s2, w4 +; CHECK-GI-FP16-NEXT: ushl v1.4s, v1.4s, v2.4s +; CHECK-GI-FP16-NEXT: neg v2.4s, v2.4s ; CHECK-GI-FP16-NEXT: mov v5.s[2], w9 -; CHECK-GI-FP16-NEXT: mov v2.s[1], w5 ; CHECK-GI-FP16-NEXT: mov v3.s[2], w8 -; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v17.4s +; CHECK-GI-FP16-NEXT: sshl v1.4s, v1.4s, v2.4s ; CHECK-GI-FP16-NEXT: fmov w8, s4 -; CHECK-GI-FP16-NEXT: eor v4.16b, v1.16b, v5.16b -; CHECK-GI-FP16-NEXT: mov v2.s[2], w6 +; CHECK-GI-FP16-NEXT: eor v2.16b, v1.16b, v5.16b +; CHECK-GI-FP16-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-GI-FP16-NEXT: mov v3.s[3], w8 -; CHECK-GI-FP16-NEXT: and v1.16b, v2.16b, v1.16b -; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v4.16b +; CHECK-GI-FP16-NEXT: and v2.16b, v7.16b, v2.16b ; CHECK-GI-FP16-NEXT: bsl v0.16b, v6.16b, v3.16b ; CHECK-GI-FP16-NEXT: orr v1.16b, v1.16b, v2.16b ; CHECK-GI-FP16-NEXT: mov s2, v0.s[1] ; CHECK-GI-FP16-NEXT: mov s3, v0.s[2] ; CHECK-GI-FP16-NEXT: mov s4, v0.s[3] -; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: mov s5, v1.s[1] ; CHECK-GI-FP16-NEXT: mov s6, v1.s[2] +; CHECK-GI-FP16-NEXT: fmov w0, s0 ; CHECK-GI-FP16-NEXT: fmov w4, s1 ; CHECK-GI-FP16-NEXT: fmov w1, s2 ; CHECK-GI-FP16-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/fdiv-combine.ll b/llvm/test/CodeGen/AArch64/fdiv-combine.ll index 1ed63f3ef2507..0627250d07791 100644 --- a/llvm/test/CodeGen/AArch64/fdiv-combine.ll +++ b/llvm/test/CodeGen/AArch64/fdiv-combine.ll @@ -171,8 +171,8 @@ entry: define @splat_fdiv_nxv2f64(double %D, %a) #1 { ; CHECK-LABEL: splat_fdiv_nxv2f64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: fdivr 
z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll index 03e64f8b785b0..a78addc490086 100644 --- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll @@ -604,8 +604,8 @@ define fastcc i1 @quantum_hadamard(i32 %0) { define @fdiv_pow2_nx4xfloat( %i) "target-features"="+sve" { ; CHECK-LABEL: fdiv_pow2_nx4xfloat: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmov z1.s, #9.00000000 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s diff --git a/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll b/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll index 67c056c780cc8..2c8e2190f8209 100644 --- a/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll +++ b/llvm/test/CodeGen/AArch64/fp-veclib-expansion.ll @@ -62,9 +62,9 @@ define @frem_nxv4f32( %unused, @frem_nxv4f32( %unused, @frem_strict_nxv2f64( %unused, ; ARMPL-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; ARMPL-NEXT: .cfi_def_cfa_offset 16 ; ARMPL-NEXT: .cfi_offset w30, -16 -; ARMPL-NEXT: ptrue p0.d ; ARMPL-NEXT: mov z0.d, z1.d ; ARMPL-NEXT: mov z1.d, z2.d +; ARMPL-NEXT: ptrue p0.d ; ARMPL-NEXT: bl armpl_svfmod_f64_x ; ARMPL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; ARMPL-NEXT: ret @@ -102,9 +102,9 @@ define @frem_strict_nxv2f64( %unused, ; SLEEF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; SLEEF-NEXT: .cfi_def_cfa_offset 16 ; SLEEF-NEXT: .cfi_offset w30, -16 -; SLEEF-NEXT: ptrue p0.d ; SLEEF-NEXT: mov z0.d, z1.d ; SLEEF-NEXT: mov z1.d, z2.d +; SLEEF-NEXT: ptrue p0.d ; SLEEF-NEXT: bl _ZGVsMxvv_fmod ; SLEEF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; SLEEF-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll index 301d28fd7be56..2ea581359af6f 100644 --- a/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/AArch64/fpclamptosat_vec.ll @@ -194,10 +194,10 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fcvtzs v0.2d, v0.2d ; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -833,10 +833,10 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fcvtzs v0.2d, v0.2d ; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: movi v2.2d, #0000000000000000 ; CHECK-NEXT: xtn v0.2s, v0.2d ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0000000000000000 -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index 92fd3183393ea..c45885a38f159 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -697,9 +697,9 @@ define <2 x i1> @test_signed_v2f32_v2i1(<2 x float> %f) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: fcvtzs v0.2s, v0.2s +; CHECK-NEXT: movi v2.2d, 
#0xffffffffffffffff ; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s -; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s +; CHECK-NEXT: smax v0.2s, v0.2s, v2.2s ; CHECK-NEXT: ret %x = call <2 x i1> @llvm.fptosi.sat.v2f32.v2i1(<2 x float> %f) ret <2 x i1> %x @@ -1620,9 +1620,9 @@ define <4 x i1> @test_signed_v4f16_v4i1(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i1> @llvm.fptosi.sat.v4f16.v4i1(<4 x half> %f) ret <4 x i1> %x @@ -1668,9 +1668,9 @@ define <4 x i13> @test_signed_v4f16_v4i13(<4 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.4h, v0.4h ; CHECK-FP16-NEXT: mvni v1.4h, #240, lsl #8 +; CHECK-FP16-NEXT: movi v2.4h, #240, lsl #8 ; CHECK-FP16-NEXT: smin v0.4h, v0.4h, v1.4h -; CHECK-FP16-NEXT: movi v1.4h, #240, lsl #8 -; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v1.4h +; CHECK-FP16-NEXT: smax v0.4h, v0.4h, v2.4h ; CHECK-FP16-NEXT: ret %x = call <4 x i13> @llvm.fptosi.sat.v4f16.v4i13(<4 x half> %f) ret <4 x i13> %x @@ -2103,9 +2103,9 @@ define <8 x i1> @test_signed_v8f16_v8i1(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: movi v1.2d, #0000000000000000 ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h +; CHECK-FP16-NEXT: movi v2.2d, #0xffffffffffffffff ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.2d, #0xffffffffffffffff -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: xtn v0.8b, v0.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i1> @llvm.fptosi.sat.v8f16.v8i1(<8 x half> %f) @@ -2254,9 +2254,9 @@ define <8 x i13> @test_signed_v8f16_v8i13(<8 x half> %f) { ; CHECK-FP16: // %bb.0: ; CHECK-FP16-NEXT: fcvtzs v0.8h, v0.8h ; CHECK-FP16-NEXT: mvni v1.8h, #240, lsl #8 +; CHECK-FP16-NEXT: movi v2.8h, #240, lsl #8 ; CHECK-FP16-NEXT: smin v0.8h, v0.8h, v1.8h -; CHECK-FP16-NEXT: movi v1.8h, #240, lsl #8 -; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v1.8h +; CHECK-FP16-NEXT: smax v0.8h, v0.8h, v2.8h ; CHECK-FP16-NEXT: ret %x = call <8 x i13> @llvm.fptosi.sat.v8f16.v8i13(<8 x half> %f) ret <8 x i13> %x diff --git a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll index 181f2185893e4..d39c09524e1ad 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/AArch64/funnel-shift-rot.ll @@ -78,9 +78,9 @@ define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) { ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.4s, #31 ; CHECK-NEXT: neg v3.4s, v1.4s +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: neg v2.4s, v3.4s ; CHECK-NEXT: ushl v1.4s, v0.4s, v1.4s ; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index 97511639ec8cf..cb9f04a7fac48 100644 --- a/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/AArch64/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -202,8 +202,8 @@ define <4 x i1> @vec_4xi32_splat_eq(<4 x i32> %x, <4 x i32> %y) 
nounwind { define <4 x i1> @vec_4xi32_nonsplat_eq(<4 x i32> %x, <4 x i32> %y) nounwind { ; CHECK-LABEL: vec_4xi32_nonsplat_eq: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: neg v1.4s, v1.4s ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: ushl v1.4s, v2.4s, v1.4s ; CHECK-NEXT: and v0.16b, v1.16b, v0.16b diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index e7352fe03d01a..8e10847e7aae3 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -179,10 +179,10 @@ define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> ; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff ; CHECK-GI-NEXT: fmov s1, w8 ; CHECK-GI-NEXT: mov v1.s[1], w8 -; CHECK-GI-NEXT: neg v5.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: neg v4.4s, v4.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mov v1.s[2], w8 -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v5.4s ; CHECK-GI-NEXT: eor v1.16b, v0.16b, v1.16b ; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b ; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index e4d2b516b8fbf..0b730f6e77156 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -64,104 +64,104 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca ; CHECK-NEXT: ldr d3, [x11] ; CHECK-NEXT: ldr d4, [x10, x8] ; CHECK-NEXT: ldr d5, [x11, x9] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: shll2 v4.4s, v1.8h, #16 +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h ; CHECK-NEXT: shll2 v6.4s, v2.8h, #16 -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: shll2 v4.4s, v3.8h, #16 -; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v4.4s, v3.4h +; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 +; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h ; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h ; CHECK-NEXT: rev64 v5.4s, v1.4s ; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s ; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s ; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v7.s[0] ; CHECK-NEXT: ext v7.16b, v5.16b, v16.16b, #8 ; CHECK-NEXT: mov v5.s[3], v4.s[2] -; CHECK-NEXT: uzp1 v4.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v1.4s, v1.4s, v0.4s +; CHECK-NEXT: ext v4.16b, v2.16b, v2.16b, #8 ; CHECK-NEXT: mov v6.d[1], v7.d[1] ; CHECK-NEXT: mov v3.d[1], v5.d[1] +; CHECK-NEXT: uzp1 v1.4s, v4.4s, v0.4s +; CHECK-NEXT: uzp2 v4.4s, v4.4s, v0.4s ; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v4.4s, v1.4s ; CHECK-NEXT: add v2.4s, v3.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: rev64 v6.4s, v1.4s ; CHECK-NEXT: rev64 v4.4s, v2.4s ; 
CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: addp v16.4s, v1.4s, v3.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s ; CHECK-NEXT: rev64 v7.4s, v0.4s +; CHECK-NEXT: addp v16.4s, v1.4s, v3.4s ; CHECK-NEXT: addp v17.4s, v0.4s, v2.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s ; CHECK-NEXT: zip1 v18.4s, v17.4s, v17.4s -; CHECK-NEXT: ext v6.16b, v1.16b, v16.16b, #8 ; CHECK-NEXT: ext v4.16b, v17.16b, v2.16b, #4 ; CHECK-NEXT: ext v5.16b, v16.16b, v3.16b, #4 ; CHECK-NEXT: mov v20.16b, v3.16b +; CHECK-NEXT: ext v6.16b, v1.16b, v16.16b, #8 ; CHECK-NEXT: ext v7.16b, v0.16b, v17.16b, #4 ; CHECK-NEXT: mov v21.16b, v2.16b ; CHECK-NEXT: trn2 v0.4s, v18.4s, v0.4s -; CHECK-NEXT: ext v19.16b, v6.16b, v1.16b, #4 -; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: mov v20.s[2], v16.s[3] ; CHECK-NEXT: zip2 v4.4s, v4.4s, v17.4s ; CHECK-NEXT: zip2 v5.4s, v5.4s, v16.4s ; CHECK-NEXT: mov v21.s[2], v17.s[3] +; CHECK-NEXT: ext v19.16b, v6.16b, v1.16b, #4 ; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: mov v18.16b, v1.16b +; CHECK-NEXT: mov v1.s[2], v16.s[1] ; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #12 ; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #12 ; CHECK-NEXT: uzp2 v4.4s, v6.4s, v19.4s ; CHECK-NEXT: mov v5.16b, v7.16b ; CHECK-NEXT: mov v6.16b, v20.16b +; CHECK-NEXT: mov v18.16b, v1.16b ; CHECK-NEXT: mov v19.16b, v21.16b -; CHECK-NEXT: mov v18.s[1], v16.s[0] ; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s ; CHECK-NEXT: mov v6.s[1], v16.s[2] ; CHECK-NEXT: mov v5.s[0], v17.s[1] +; CHECK-NEXT: mov v18.s[1], v16.s[0] ; CHECK-NEXT: mov v19.s[1], v17.s[2] ; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: sub v16.4s, v20.4s, v3.4s ; CHECK-NEXT: sub v17.4s, v21.4s, v2.4s -; CHECK-NEXT: add v4.4s, v18.4s, v4.4s ; CHECK-NEXT: add v3.4s, v6.4s, v3.4s ; CHECK-NEXT: add v0.4s, v0.4s, v5.4s +; CHECK-NEXT: add v4.4s, v18.4s, v4.4s ; CHECK-NEXT: add v2.4s, v19.4s, v2.4s -; CHECK-NEXT: mov v4.d[1], v1.d[1] ; CHECK-NEXT: mov v3.d[1], v16.d[1] ; CHECK-NEXT: mov v0.d[1], v7.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] ; CHECK-NEXT: mov v2.d[1], v17.d[1] -; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 ; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 ; CHECK-NEXT: cmlt v7.8h, v2.8h, #0 -; CHECK-NEXT: add v4.4s, v6.4s, v4.4s ; CHECK-NEXT: add v3.4s, v1.4s, v3.4s ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s +; CHECK-NEXT: add v4.4s, v6.4s, v4.4s ; CHECK-NEXT: add v2.4s, v7.4s, v2.4s ; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b ; CHECK-NEXT: eor v2.16b, v2.16b, v7.16b ; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v3.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 diff --git a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll index 29f9c0336bbcc..542b2e90ffc15 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector-res-legalization.ll @@ -9,9 +9,9 @@ define @vec_scalable_subvec_scalable_idx_zero_i8(ptr %a, ptr % ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] -; CHECK-NEXT: 
ld1b { z1.s }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1] ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h ; CHECK-NEXT: ret @@ -25,9 +25,9 @@ define @vec_scalable_subvec_scalable_idx_nonzero_i8(ptr %a, pt ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] -; CHECK-NEXT: ld1b { z1.s }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x1] ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h ; CHECK-NEXT: ret @@ -41,9 +41,9 @@ define @vec_scalable_subvec_scalable_idx_zero_i16(ptr %a, ptr ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_zero_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.d }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1] ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: ret @@ -57,9 +57,9 @@ define @vec_scalable_subvec_scalable_idx_nonzero_i16(ptr %a, ; CHECK-LABEL: vec_scalable_subvec_scalable_idx_nonzero_i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1h { z1.d }, p1/z, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x1] ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: ret @@ -76,10 +76,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i8(ptr %a, ptr %b) ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.h, vl8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: sel z0.h, p1, z0.h, z1.h +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <8 x i8>, ptr %b @@ -92,19 +92,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i8(ptr %a, ptr % ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: mov w9, #8 // =0x8 ; CHECK-NEXT: cmp x8, #8 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0] -; CHECK-NEXT: st1h { z1.h }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -120,10 +120,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i16(ptr %a, ptr %b ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.s, vl4 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] -; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ptrue p0.s, vl4 +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <4 x i16>, ptr %b @@ -136,19 +136,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i16(ptr %a, ptr ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: mov w9, #4 // =0x4 ; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0] -; CHECK-NEXT: st1w { z1.s }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -164,10 +164,10 @@ define @vec_scalable_subvec_fixed_idx_zero_i32(ptr %a, ptr %b ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] -; CHECK-NEXT: sel z0.d, p1, z0.d, z1.d +; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d ; CHECK-NEXT: ret %vec = load , ptr %a %subvec = load <2 x i32>, ptr %b @@ -180,19 +180,19 @@ define @vec_scalable_subvec_fixed_idx_nonzero_i32(ptr %a, ptr ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 -; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr d1, [x1] ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: mov w9, #2 // =0x2 ; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: ushll v1.2d, v1.2s, #0 ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0] -; CHECK-NEXT: st1d { z1.d }, p0, [sp] -; CHECK-NEXT: str q0, [x9, x8] +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll index 1a4ab6ab334a6..9bd2ed240810d 100644 --- a/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll +++ b/llvm/test/CodeGen/AArch64/intrinsic-cttz-elts-sve.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme < %s | FileCheck %s @@ -31,28 +32,28 @@ define i32 @ctz_nxv32i1( %a) #0 { ; CHECK-NEXT: neg x8, x8 ; CHECK-NEXT: punpklo p3.h, p1.b ; CHECK-NEXT: rdvl x9, #2 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: rdvl x8, #-1 -; CHECK-NEXT: punpkhi p1.h, p1.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: mov z2.h, w8 -; CHECK-NEXT: inch z0.h, all, mul #4 +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: mov z3.h, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.h +; CHECK-NEXT: inch z0.h, all, mul #4 ; CHECK-NEXT: mov z5.h, p3/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: add z1.h, z0.h, z1.h -; CHECK-NEXT: add z4.h, z0.h, z2.h ; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.h, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: add z1.h, z0.h, 
z1.h +; CHECK-NEXT: add z4.h, z0.h, z2.h ; CHECK-NEXT: and z0.d, z0.d, z3.d ; CHECK-NEXT: add z2.h, z1.h, z2.h ; CHECK-NEXT: and z3.d, z4.d, z5.d ; CHECK-NEXT: and z1.d, z1.d, z6.d ; CHECK-NEXT: and z2.d, z2.d, z7.d -; CHECK-NEXT: umax z0.h, p2/m, z0.h, z3.h -; CHECK-NEXT: umax z1.h, p2/m, z1.h, z2.h -; CHECK-NEXT: umax z0.h, p2/m, z0.h, z1.h -; CHECK-NEXT: umaxv h0, p2, z0.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z3.h +; CHECK-NEXT: umax z1.h, p0/m, z1.h, z2.h +; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w8, w9, w8 ; CHECK-NEXT: and w0, w8, #0xffff @@ -65,12 +66,12 @@ define i32 @ctz_nxv4i32( %a) #0 { ; CHECK-LABEL: ctz_nxv4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: index z1.s, #0, #-1 ; CHECK-NEXT: cntw x9 -; CHECK-NEXT: incw z1.s ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z0.d, z1.d, z0.d +; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: incw z0.s +; CHECK-NEXT: and z0.d, z0.d, z1.d ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w8, s0 @@ -87,38 +88,38 @@ define i64 @vscale_4096( %a) #1 { ; CHECK-LABEL: vscale_4096: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: index z1.s, #0, #-1 ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: cnth x9 ; CHECK-NEXT: neg x8, x8 -; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: neg x8, x9 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: index z0.s, #0, #-1 +; CHECK-NEXT: mov z0.s, w8 +; CHECK-NEXT: cnth x8 +; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: incw z1.s, all, mul #4 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: incw z0.s, all, mul #4 -; CHECK-NEXT: add z1.s, z0.s, z1.s -; CHECK-NEXT: add z5.s, z0.s, z2.s +; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: add z4.s, z1.s, z2.s ; CHECK-NEXT: punpkhi p2.h, p1.b ; CHECK-NEXT: punpkhi p3.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z2.s, z1.s, z2.s -; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: mov z3.s, p2/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p2.s -; CHECK-NEXT: mov z4.s, p3/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: add z2.s, z0.s, z2.s +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: mov z5.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z7.s, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: and z1.d, z1.d, z3.d -; CHECK-NEXT: and z2.d, z2.d, z4.d -; CHECK-NEXT: and z3.d, z5.d, z6.d -; CHECK-NEXT: and z0.d, z0.d, z7.d -; CHECK-NEXT: umax z1.s, p2/m, z1.s, z2.s -; CHECK-NEXT: umax z0.s, p2/m, z0.s, z3.s -; CHECK-NEXT: umax z0.s, p2/m, z0.s, z1.s -; CHECK-NEXT: umaxv s0, p2, z0.s +; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: and z2.d, z2.d, z5.d +; CHECK-NEXT: and z3.d, z4.d, z6.d +; CHECK-NEXT: and z1.d, z1.d, z7.d +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: umax z1.s, p0/m, z1.s, z3.s +; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sub w0, w9, w8 ; CHECK-NEXT: ret @@ -130,21 +131,21 @@ define i64 @vscale_4096_poison( %a) #1 { ; CHECK-LABEL: vscale_4096_poison: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: index z1.h, #0, #-1 ; CHECK-NEXT: cnth x8 -; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: neg x8, x8 -; 
CHECK-NEXT: mov z1.h, w8 +; CHECK-NEXT: rdvl x9, #1 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: index z0.h, #0, #-1 +; CHECK-NEXT: mov z0.h, w8 +; CHECK-NEXT: inch z1.h, all, mul #2 ; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: inch z0.h, all, mul #2 -; CHECK-NEXT: add z1.h, z0.h, z1.h +; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: mov z2.h, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: and z1.d, z1.d, z2.d -; CHECK-NEXT: and z0.d, z0.d, z3.d +; CHECK-NEXT: and z0.d, z0.d, z2.d +; CHECK-NEXT: and z1.d, z1.d, z3.d ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w8, s0 @@ -161,16 +162,16 @@ define i32 @ctz_nxv8i1_no_range( %a) { ; CHECK-LABEL: ctz_nxv8i1_no_range: ; CHECK: // %bb.0: ; CHECK-NEXT: index z0.s, #0, #-1 -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: cntw x8 -; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: neg x8, x8 +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: cnth x9 ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: incw z0.s, all, mul #2 ; CHECK-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: incw z0.s, all, mul #2 ; CHECK-NEXT: add z1.s, z0.s, z1.s ; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: and z1.d, z1.d, z3.d @@ -212,8 +213,8 @@ define i32 @ctz_nxv16i1_poison( %pg, %a) { define i32 @ctz_and_nxv16i1( %pg, %a, %b) { ; CHECK-LABEL: ctz_and_nxv16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: brkb p0.b, p1/z, p0.b ; CHECK-NEXT: cntp x0, p0, p0.b ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index f5a7b5dc9f492..ae4ced258bb8e 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -6555,18 +6555,18 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: scvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6582,18 +6582,18 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ucvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and 
v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6606,17 +6606,17 @@ define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6629,17 +6629,17 @@ define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ucvtf v0.2d, v0.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v2.16b, v3.16b, v2.16b -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: fcmeq v2.4s, v0.4s, v0.4s +; CHECK-NEXT: add v2.4s, v0.4s, v2.4s +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -6658,22 +6658,22 @@ define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v4.4s, v2.4s, #16 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v1.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: @@ -6692,22 +6692,22 @@ define <8 
x bfloat> @utofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-NEXT: fcvtn v0.2s, v0.2d ; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v4.4s, v2.4s, #16 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v1.4s -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: and v4.16b, v4.16b, v3.16b -; CHECK-NEXT: and v3.16b, v5.16b, v3.16b +; CHECK-NEXT: add v6.4s, v2.4s, v3.4s +; CHECK-NEXT: add v3.4s, v0.4s, v3.4s +; CHECK-NEXT: and v4.16b, v4.16b, v1.16b +; CHECK-NEXT: and v1.16b, v5.16b, v1.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s +; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: @@ -6718,51 +6718,51 @@ entry: define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: stofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v2.2d, v2.2d ; CHECK-NEXT: scvtf v0.2d, v0.2d +; CHECK-NEXT: scvtf v2.2d, v2.2d ; CHECK-NEXT: scvtf v6.2d, v6.2d ; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d ; CHECK-NEXT: scvtf v1.2d, v1.2d +; CHECK-NEXT: scvtf v3.2d, v3.2d ; CHECK-NEXT: scvtf v7.2d, v7.2d ; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v6.2s, v6.2d ; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: add v17.4s, v2.4s, v1.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v2.4s, #16 ; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v18.4s, v4.4s, #16 -; CHECK-NEXT: add v20.4s, v6.4s, v1.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v3.16b, v18.16b, v3.16b -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v17.4s, v4.4s, #16 +; CHECK-NEXT: add v19.4s, v0.4s, v3.4s +; CHECK-NEXT: add v18.4s, v2.4s, v3.4s +; CHECK-NEXT: add v20.4s, v6.4s, v3.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v7.4s, v7.4s, v19.4s ; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v5.4s, v5.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-NEXT: add 
v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s +; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: orr v6.4s, #64, lsl #16 ; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -6776,51 +6776,51 @@ entry: define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: utofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v0.2d, v0.2d +; CHECK-NEXT: ucvtf v2.2d, v2.2d ; CHECK-NEXT: ucvtf v6.2d, v6.2d ; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d ; CHECK-NEXT: ucvtf v1.2d, v1.2d +; CHECK-NEXT: ucvtf v3.2d, v3.2d ; CHECK-NEXT: ucvtf v7.2d, v7.2d ; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fcvtn v2.2s, v2.2d ; CHECK-NEXT: fcvtn v6.2s, v6.2d ; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-NEXT: fcvtn2 v6.4s, v7.2d ; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: movi v3.4s, #1 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 +; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: add v17.4s, v2.4s, v1.4s -; CHECK-NEXT: add v19.4s, v0.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v2.4s, #16 ; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v18.4s, v4.4s, #16 -; CHECK-NEXT: add v20.4s, v6.4s, v1.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s -; CHECK-NEXT: and v5.16b, v5.16b, v3.16b -; CHECK-NEXT: and v7.16b, v7.16b, v3.16b -; CHECK-NEXT: and v16.16b, v16.16b, v3.16b -; CHECK-NEXT: and v3.16b, v18.16b, v3.16b -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: ushr v17.4s, v4.4s, #16 +; CHECK-NEXT: add v19.4s, v0.4s, v3.4s +; CHECK-NEXT: add v18.4s, v2.4s, v3.4s +; CHECK-NEXT: add v20.4s, v6.4s, v3.4s +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v16.16b, v16.16b, v1.16b +; CHECK-NEXT: and v1.16b, v17.16b, v1.16b ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 ; CHECK-NEXT: add v7.4s, v7.4s, v19.4s ; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: add v5.4s, v5.4s, v18.4s +; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s +; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 ; CHECK-NEXT: orr v6.4s, #64, lsl #16 ; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: mov v5.16b, v19.16b +; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b ; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b ; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h @@ -6834,107 +6834,107 @@ entry: define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: stofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v16.2d, 
v2.2d -; CHECK-NEXT: scvtf v17.2d, v0.2d -; CHECK-NEXT: scvtf v18.2d, v3.2d -; CHECK-NEXT: scvtf v19.2d, v6.2d -; CHECK-NEXT: ldp q24, q23, [sp, #96] -; CHECK-NEXT: scvtf v21.2d, v1.2d -; CHECK-NEXT: scvtf v22.2d, v4.2d +; CHECK-NEXT: scvtf v17.2d, v2.2d +; CHECK-NEXT: scvtf v18.2d, v0.2d +; CHECK-NEXT: scvtf v19.2d, v3.2d +; CHECK-NEXT: scvtf v3.2d, v6.2d +; CHECK-NEXT: ldp q21, q20, [sp, #32] +; CHECK-NEXT: scvtf v4.2d, v4.2d ; CHECK-NEXT: scvtf v6.2d, v7.2d -; CHECK-NEXT: scvtf v7.2d, v5.2d -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v16.2d -; CHECK-NEXT: ldp q20, q16, [sp, #32] -; CHECK-NEXT: fcvtn v1.2s, v17.2d -; CHECK-NEXT: ldp q5, q17, [sp] -; CHECK-NEXT: fcvtn v4.2s, v19.2d -; CHECK-NEXT: scvtf v23.2d, v23.2d +; CHECK-NEXT: scvtf v5.2d, v5.2d +; CHECK-NEXT: ldp q24, q23, [sp, #64] +; CHECK-NEXT: movi v16.4s, #1 +; CHECK-NEXT: fcvtn v0.2s, v17.2d +; CHECK-NEXT: scvtf v17.2d, v1.2d +; CHECK-NEXT: fcvtn v1.2s, v18.2d +; CHECK-NEXT: fcvtn v3.2s, v3.2d +; CHECK-NEXT: ldp q18, q7, [sp] +; CHECK-NEXT: scvtf v21.2d, v21.2d +; CHECK-NEXT: fcvtn v4.2s, v4.2d +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: scvtf v20.2d, v20.2d -; CHECK-NEXT: scvtf v16.2d, v16.2d -; CHECK-NEXT: fcvtn2 v0.4s, v18.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: scvtf v25.2d, v5.2d -; CHECK-NEXT: fcvtn v5.2s, v22.2d -; CHECK-NEXT: fcvtn2 v1.4s, v21.2d -; CHECK-NEXT: scvtf v21.2d, v24.2d -; CHECK-NEXT: scvtf v17.2d, v17.2d -; CHECK-NEXT: fcvtn2 v4.4s, v6.2d +; CHECK-NEXT: fcvtn2 v0.4s, v19.2d +; CHECK-NEXT: ldp q22, q19, [sp, #96] +; CHECK-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-NEXT: fcvtn2 v3.4s, v6.2d +; CHECK-NEXT: scvtf v18.2d, v18.2d +; CHECK-NEXT: scvtf v17.2d, v24.2d +; CHECK-NEXT: fcvtn v6.2s, v21.2d +; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: scvtf v22.2d, v22.2d +; CHECK-NEXT: scvtf v21.2d, v23.2d +; CHECK-NEXT: scvtf v7.2d, v7.2d +; CHECK-NEXT: ushr v24.4s, v0.4s, #16 +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s ; CHECK-NEXT: scvtf v19.2d, v19.2d -; CHECK-NEXT: scvtf v6.2d, v18.2d -; CHECK-NEXT: fcvtn v18.2s, v20.2d -; CHECK-NEXT: ushr v22.4s, v0.4s, #16 -; CHECK-NEXT: add v20.4s, v0.4s, v3.4s -; CHECK-NEXT: fcvtn2 v5.4s, v7.2d -; CHECK-NEXT: fcvtn v24.2s, v25.2d -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 -; CHECK-NEXT: fcvtn v21.2s, v21.2d -; CHECK-NEXT: add v26.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v27.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn v19.2s, v19.2d -; CHECK-NEXT: fcvtn2 v18.4s, v16.2d -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v7.16b, v7.16b, v2.16b -; CHECK-NEXT: fcmeq v25.4s, v0.4s, v0.4s +; CHECK-NEXT: ushr v23.4s, v1.4s, #16 +; CHECK-NEXT: ushr v25.4s, v3.4s, #16 +; CHECK-NEXT: fcvtn v18.2s, v18.2d +; CHECK-NEXT: fcvtn2 v6.4s, v20.2d +; CHECK-NEXT: add v26.4s, v1.4s, v2.4s +; CHECK-NEXT: fcvtn v17.2s, v17.2d +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: fcvtn v22.2s, v22.2d +; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s +; CHECK-NEXT: and v23.16b, v23.16b, v16.16b ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v5.4s, #16 -; CHECK-NEXT: fcvtn2 v24.4s, v17.2d -; CHECK-NEXT: add v17.4s, v5.4s, v3.4s -; CHECK-NEXT: fcvtn2 v21.4s, v23.2d -; CHECK-NEXT: and v16.16b, v27.16b, v2.16b -; CHECK-NEXT: add v20.4s, v22.4s, v20.4s -; CHECK-NEXT: fcvtn2 v19.4s, v6.2d -; CHECK-NEXT: add v7.4s, v7.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v18.4s, #16 -; CHECK-NEXT: and v23.16b, v28.16b, v2.16b -; CHECK-NEXT: add v22.4s, v4.4s, v3.4s -; CHECK-NEXT: fcmeq v6.4s, v1.4s, v1.4s -; CHECK-NEXT: ushr v27.4s, 
v24.4s, #16 -; CHECK-NEXT: add v30.4s, v24.4s, v3.4s +; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s +; CHECK-NEXT: fcvtn2 v18.4s, v7.2d +; CHECK-NEXT: add v7.4s, v3.4s, v2.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v5.4s, v24.4s, v5.4s +; CHECK-NEXT: and v24.16b, v25.16b, v16.16b +; CHECK-NEXT: ushr v25.4s, v4.4s, #16 +; CHECK-NEXT: fcvtn2 v22.4s, v19.2d +; CHECK-NEXT: add v19.4s, v23.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v6.4s, #16 +; CHECK-NEXT: fcvtn2 v17.4s, v21.2d +; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v21.4s, #16 -; CHECK-NEXT: add v31.4s, v21.4s, v3.4s -; CHECK-NEXT: and v26.16b, v26.16b, v2.16b -; CHECK-NEXT: add v17.4s, v23.4s, v17.4s -; CHECK-NEXT: add v23.4s, v18.4s, v3.4s -; CHECK-NEXT: ushr v29.4s, v19.4s, #16 -; CHECK-NEXT: and v27.16b, v27.16b, v2.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: add v16.4s, v16.4s, v22.4s -; CHECK-NEXT: and v28.16b, v28.16b, v2.16b -; CHECK-NEXT: fcmeq v22.4s, v4.4s, v4.4s -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: and v2.16b, v29.16b, v2.16b -; CHECK-NEXT: fcmeq v29.4s, v5.4s, v5.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: add v23.4s, v26.4s, v23.4s -; CHECK-NEXT: fcmeq v26.4s, v18.4s, v18.4s -; CHECK-NEXT: add v27.4s, v27.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v24.4s, v24.4s +; CHECK-NEXT: and v23.16b, v25.16b, v16.16b +; CHECK-NEXT: add v25.4s, v4.4s, v2.4s +; CHECK-NEXT: add v7.4s, v24.4s, v7.4s +; CHECK-NEXT: ushr v24.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v18.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b +; CHECK-NEXT: ushr v28.4s, v22.4s, #16 +; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: add v23.4s, v23.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v16.16b +; CHECK-NEXT: add v26.4s, v6.4s, v2.4s +; CHECK-NEXT: ushr v29.4s, v17.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: add v2.4s, v17.4s, v2.4s +; CHECK-NEXT: and v28.16b, v28.16b, v16.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b +; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s +; CHECK-NEXT: orr v6.4s, #64, lsl #16 +; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: add v24.4s, v24.4s, v30.4s +; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v21.4s, v21.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v19.4s, v19.4s +; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v24.4s, #64, lsl #16 -; CHECK-NEXT: orr v21.4s, #64, lsl #16 -; CHECK-NEXT: orr v19.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v7.16b, v6.16b -; CHECK-NEXT: bit v4.16b, v16.16b, v22.16b -; CHECK-NEXT: mov v6.16b, v26.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v0.16b, v20.16b, v25.16b -; CHECK-NEXT: bit v5.16b, v17.16b, v29.16b -; CHECK-NEXT: bsl v3.16b, v2.16b, v19.16b -; CHECK-NEXT: bsl v6.16b, v23.16b, v18.16b -; CHECK-NEXT: bsl v7.16b, v27.16b, v24.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v21.16b +; CHECK-NEXT: orr v22.4s, #64, lsl #16 +; CHECK-NEXT: mov v5.16b, v26.16b +; CHECK-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: orr v17.4s, #64, lsl #16 ; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: uzp2 v1.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, 
v6.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b +; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v30.16b +; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h +; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h ; CHECK-NEXT: ret entry: %c = sitofp <32 x i64> %a to <32 x bfloat> @@ -6944,107 +6944,107 @@ entry: define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: utofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v16.2d, v2.2d -; CHECK-NEXT: ucvtf v17.2d, v0.2d -; CHECK-NEXT: ucvtf v18.2d, v3.2d -; CHECK-NEXT: ucvtf v19.2d, v6.2d -; CHECK-NEXT: ldp q24, q23, [sp, #96] -; CHECK-NEXT: ucvtf v21.2d, v1.2d -; CHECK-NEXT: ucvtf v22.2d, v4.2d +; CHECK-NEXT: ucvtf v17.2d, v2.2d +; CHECK-NEXT: ucvtf v18.2d, v0.2d +; CHECK-NEXT: ucvtf v19.2d, v3.2d +; CHECK-NEXT: ucvtf v3.2d, v6.2d +; CHECK-NEXT: ldp q21, q20, [sp, #32] +; CHECK-NEXT: ucvtf v4.2d, v4.2d ; CHECK-NEXT: ucvtf v6.2d, v7.2d -; CHECK-NEXT: ucvtf v7.2d, v5.2d -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v16.2d -; CHECK-NEXT: ldp q20, q16, [sp, #32] -; CHECK-NEXT: fcvtn v1.2s, v17.2d -; CHECK-NEXT: ldp q5, q17, [sp] -; CHECK-NEXT: fcvtn v4.2s, v19.2d -; CHECK-NEXT: ucvtf v23.2d, v23.2d +; CHECK-NEXT: ucvtf v5.2d, v5.2d +; CHECK-NEXT: ldp q24, q23, [sp, #64] +; CHECK-NEXT: movi v16.4s, #1 +; CHECK-NEXT: fcvtn v0.2s, v17.2d +; CHECK-NEXT: ucvtf v17.2d, v1.2d +; CHECK-NEXT: fcvtn v1.2s, v18.2d +; CHECK-NEXT: fcvtn v3.2s, v3.2d +; CHECK-NEXT: ldp q18, q7, [sp] +; CHECK-NEXT: ucvtf v21.2d, v21.2d +; CHECK-NEXT: fcvtn v4.2s, v4.2d +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: ucvtf v16.2d, v16.2d -; CHECK-NEXT: fcvtn2 v0.4s, v18.2d -; CHECK-NEXT: ldp q19, q18, [sp, #64] -; CHECK-NEXT: ucvtf v25.2d, v5.2d -; CHECK-NEXT: fcvtn v5.2s, v22.2d -; CHECK-NEXT: fcvtn2 v1.4s, v21.2d -; CHECK-NEXT: ucvtf v21.2d, v24.2d -; CHECK-NEXT: ucvtf v17.2d, v17.2d -; CHECK-NEXT: fcvtn2 v4.4s, v6.2d +; CHECK-NEXT: fcvtn2 v0.4s, v19.2d +; CHECK-NEXT: ldp q22, q19, [sp, #96] +; CHECK-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-NEXT: fcvtn2 v3.4s, v6.2d +; CHECK-NEXT: ucvtf v18.2d, v18.2d +; CHECK-NEXT: ucvtf v17.2d, v24.2d +; CHECK-NEXT: fcvtn v6.2s, v21.2d +; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: ucvtf v22.2d, v22.2d +; CHECK-NEXT: ucvtf v21.2d, v23.2d +; CHECK-NEXT: ucvtf v7.2d, v7.2d +; CHECK-NEXT: ushr v24.4s, v0.4s, #16 +; CHECK-NEXT: add v5.4s, v0.4s, v2.4s ; CHECK-NEXT: ucvtf v19.2d, v19.2d -; CHECK-NEXT: ucvtf v6.2d, v18.2d -; CHECK-NEXT: fcvtn v18.2s, v20.2d -; CHECK-NEXT: ushr v22.4s, v0.4s, #16 -; CHECK-NEXT: add v20.4s, v0.4s, v3.4s -; CHECK-NEXT: fcvtn2 v5.4s, v7.2d -; CHECK-NEXT: fcvtn v24.2s, v25.2d -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 -; CHECK-NEXT: fcvtn v21.2s, v21.2d -; CHECK-NEXT: add v26.4s, v1.4s, v3.4s -; CHECK-NEXT: ushr v27.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn v19.2s, v19.2d -; CHECK-NEXT: fcvtn2 v18.4s, v16.2d -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b -; CHECK-NEXT: and v7.16b, v7.16b, v2.16b -; CHECK-NEXT: fcmeq v25.4s, v0.4s, v0.4s +; CHECK-NEXT: ushr v23.4s, v1.4s, #16 +; CHECK-NEXT: ushr v25.4s, v3.4s, #16 +; CHECK-NEXT: fcvtn v18.2s, v18.2d +; CHECK-NEXT: fcvtn2 v6.4s, v20.2d +; CHECK-NEXT: add v26.4s, v1.4s, v2.4s +; CHECK-NEXT: fcvtn v17.2s, v17.2d +; 
CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: fcvtn v22.2s, v22.2d +; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s +; CHECK-NEXT: and v23.16b, v23.16b, v16.16b ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v5.4s, #16 -; CHECK-NEXT: fcvtn2 v24.4s, v17.2d -; CHECK-NEXT: add v17.4s, v5.4s, v3.4s -; CHECK-NEXT: fcvtn2 v21.4s, v23.2d -; CHECK-NEXT: and v16.16b, v27.16b, v2.16b -; CHECK-NEXT: add v20.4s, v22.4s, v20.4s -; CHECK-NEXT: fcvtn2 v19.4s, v6.2d -; CHECK-NEXT: add v7.4s, v7.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v18.4s, #16 -; CHECK-NEXT: and v23.16b, v28.16b, v2.16b -; CHECK-NEXT: add v22.4s, v4.4s, v3.4s -; CHECK-NEXT: fcmeq v6.4s, v1.4s, v1.4s -; CHECK-NEXT: ushr v27.4s, v24.4s, #16 -; CHECK-NEXT: add v30.4s, v24.4s, v3.4s +; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s +; CHECK-NEXT: fcvtn2 v18.4s, v7.2d +; CHECK-NEXT: add v7.4s, v3.4s, v2.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v5.4s, v24.4s, v5.4s +; CHECK-NEXT: and v24.16b, v25.16b, v16.16b +; CHECK-NEXT: ushr v25.4s, v4.4s, #16 +; CHECK-NEXT: fcvtn2 v22.4s, v19.2d +; CHECK-NEXT: add v19.4s, v23.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v6.4s, #16 +; CHECK-NEXT: fcvtn2 v17.4s, v21.2d +; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: ushr v28.4s, v21.4s, #16 -; CHECK-NEXT: add v31.4s, v21.4s, v3.4s -; CHECK-NEXT: and v26.16b, v26.16b, v2.16b -; CHECK-NEXT: add v17.4s, v23.4s, v17.4s -; CHECK-NEXT: add v23.4s, v18.4s, v3.4s -; CHECK-NEXT: ushr v29.4s, v19.4s, #16 -; CHECK-NEXT: and v27.16b, v27.16b, v2.16b -; CHECK-NEXT: add v3.4s, v19.4s, v3.4s -; CHECK-NEXT: add v16.4s, v16.4s, v22.4s -; CHECK-NEXT: and v28.16b, v28.16b, v2.16b -; CHECK-NEXT: fcmeq v22.4s, v4.4s, v4.4s -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: and v2.16b, v29.16b, v2.16b -; CHECK-NEXT: fcmeq v29.4s, v5.4s, v5.4s -; CHECK-NEXT: orr v5.4s, #64, lsl #16 -; CHECK-NEXT: add v23.4s, v26.4s, v23.4s -; CHECK-NEXT: fcmeq v26.4s, v18.4s, v18.4s -; CHECK-NEXT: add v27.4s, v27.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v24.4s, v24.4s +; CHECK-NEXT: and v23.16b, v25.16b, v16.16b +; CHECK-NEXT: add v25.4s, v4.4s, v2.4s +; CHECK-NEXT: add v7.4s, v24.4s, v7.4s +; CHECK-NEXT: ushr v24.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v18.4s, v2.4s +; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b +; CHECK-NEXT: ushr v28.4s, v22.4s, #16 +; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: add v23.4s, v23.4s, v25.4s +; CHECK-NEXT: and v25.16b, v26.16b, v16.16b +; CHECK-NEXT: add v26.4s, v6.4s, v2.4s +; CHECK-NEXT: ushr v29.4s, v17.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b +; CHECK-NEXT: add v2.4s, v17.4s, v2.4s +; CHECK-NEXT: and v28.16b, v28.16b, v16.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b +; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s +; CHECK-NEXT: orr v6.4s, #64, lsl #16 +; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: add v24.4s, v24.4s, v30.4s +; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v21.4s, v21.4s -; CHECK-NEXT: add v2.4s, v2.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v19.4s, v19.4s +; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 ; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v24.4s, #64, lsl #16 -; CHECK-NEXT: orr v21.4s, #64, lsl #16 -; CHECK-NEXT: orr v19.4s, #64, lsl #16 -; CHECK-NEXT: bit v1.16b, v7.16b, v6.16b -; CHECK-NEXT: bit 
v4.16b, v16.16b, v22.16b -; CHECK-NEXT: mov v6.16b, v26.16b -; CHECK-NEXT: mov v7.16b, v30.16b -; CHECK-NEXT: mov v16.16b, v31.16b -; CHECK-NEXT: bit v0.16b, v20.16b, v25.16b -; CHECK-NEXT: bit v5.16b, v17.16b, v29.16b -; CHECK-NEXT: bsl v3.16b, v2.16b, v19.16b -; CHECK-NEXT: bsl v6.16b, v23.16b, v18.16b -; CHECK-NEXT: bsl v7.16b, v27.16b, v24.16b -; CHECK-NEXT: bsl v16.16b, v28.16b, v21.16b +; CHECK-NEXT: orr v22.4s, #64, lsl #16 +; CHECK-NEXT: mov v5.16b, v26.16b +; CHECK-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: orr v17.4s, #64, lsl #16 ; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: uzp2 v1.8h, v5.8h, v4.8h -; CHECK-NEXT: uzp2 v2.8h, v7.8h, v6.8h -; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h +; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b +; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v30.16b +; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b +; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b +; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h +; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h +; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h ; CHECK-NEXT: ret entry: %c = uitofp <32 x i64> %a to <32 x bfloat> @@ -7059,9 +7059,9 @@ define <2 x bfloat> @stofp_v2i32_v2bf16(<2 x i32> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <2 x i32> %a to <2 x bfloat> @@ -7076,9 +7076,9 @@ define <2 x bfloat> @utofp_v2i32_v2bf16(<2 x i32> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <2 x i32> %a to <2 x bfloat> @@ -7092,9 +7092,9 @@ define <3 x bfloat> @stofp_v3i32_v3bf16(<3 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i32> %a to <3 x bfloat> @@ -7108,9 +7108,9 @@ define <3 x bfloat> @utofp_v3i32_v3bf16(<3 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i32> %a to <3 x bfloat> @@ -7124,9 +7124,9 @@ define <4 x bfloat> @stofp_v4i32_v4bf16(<4 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i32> %a to <4 x bfloat> @@ -7140,9 +7140,9 @@ define <4 x bfloat> 
@utofp_v4i32_v4bf16(<4 x i32> %a) { ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i32> %a to <4 x bfloat> @@ -7155,15 +7155,15 @@ define <8 x bfloat> @stofp_v8i32_v8bf16(<8 x i32> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: movi v5.4s, #127, msl #8 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v4.4s, v1.4s, #16 ; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v2.16b, v4.16b, v2.16b ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v3.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s ; CHECK-NEXT: ret entry: %c = sitofp <8 x i32> %a to <8 x bfloat> @@ -7176,15 +7176,15 @@ define <8 x bfloat> @utofp_v8i32_v8bf16(<8 x i32> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: movi v2.4s, #1 ; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: movi v5.4s, #127, msl #8 ; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v4.4s, v1.4s, #16 ; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v2.16b, v4.16b, v2.16b ; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: movi v3.4s, #127, msl #8 ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v1.4s, v3.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v1.4s, v5.4s ; CHECK-NEXT: ret entry: %c = uitofp <8 x i32> %a to <8 x bfloat> @@ -7194,28 +7194,28 @@ entry: define <16 x bfloat> @stofp_v16i32_v16bf16(<16 x i32> %a) { ; CHECK-LABEL: stofp_v16i32_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: scvtf v2.4s, v2.4s -; CHECK-NEXT: movi v4.4s, #1 -; CHECK-NEXT: scvtf v1.4s, v1.4s +; CHECK-NEXT: scvtf v0.4s, v0.4s +; CHECK-NEXT: scvtf v4.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: movi v17.4s, #127, msl #8 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 ; CHECK-NEXT: ushr v6.4s, v2.4s, #16 -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 +; CHECK-NEXT: ushr v7.4s, v4.4s, #16 ; CHECK-NEXT: ushr v16.4s, v3.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: and v5.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v6.4s -; CHECK-NEXT: addhn2 v0.8h, v5.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v3.4s, v6.4s +; CHECK-NEXT: and v5.16b, v7.16b, v1.16b +; CHECK-NEXT: and v6.16b, v16.16b, v1.16b +; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s +; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i32> %a to <16 x bfloat> @@ -7225,28 +7225,28 @@ entry: define <16 x 
bfloat> @utofp_v16i32_v16bf16(<16 x i32> %a) { ; CHECK-LABEL: utofp_v16i32_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ucvtf v2.4s, v2.4s -; CHECK-NEXT: movi v4.4s, #1 -; CHECK-NEXT: ucvtf v1.4s, v1.4s +; CHECK-NEXT: ucvtf v0.4s, v0.4s +; CHECK-NEXT: ucvtf v4.4s, v1.4s +; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: movi v17.4s, #127, msl #8 ; CHECK-NEXT: ushr v5.4s, v0.4s, #16 ; CHECK-NEXT: ushr v6.4s, v2.4s, #16 -; CHECK-NEXT: ushr v7.4s, v1.4s, #16 +; CHECK-NEXT: ushr v7.4s, v4.4s, #16 ; CHECK-NEXT: ushr v16.4s, v3.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v4.16b -; CHECK-NEXT: and v6.16b, v6.16b, v4.16b +; CHECK-NEXT: and v5.16b, v5.16b, v1.16b +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: movi v6.4s, #127, msl #8 -; CHECK-NEXT: and v5.16b, v7.16b, v4.16b -; CHECK-NEXT: and v4.16b, v16.16b, v4.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v6.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v6.4s -; CHECK-NEXT: addhn2 v0.8h, v5.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v3.4s, v6.4s +; CHECK-NEXT: and v5.16b, v7.16b, v1.16b +; CHECK-NEXT: and v6.16b, v16.16b, v1.16b +; CHECK-NEXT: addhn v0.4h, v0.4s, v17.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v17.4s +; CHECK-NEXT: add v2.4s, v5.4s, v4.4s +; CHECK-NEXT: add v3.4s, v6.4s, v3.4s +; CHECK-NEXT: addhn2 v0.8h, v2.4s, v17.4s +; CHECK-NEXT: addhn2 v1.8h, v3.4s, v17.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i32> %a to <16 x bfloat> @@ -7262,42 +7262,42 @@ define <32 x bfloat> @stofp_v32i32_v32bf16(<32 x i32> %a) { ; CHECK-NEXT: scvtf v6.4s, v6.4s ; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: scvtf v1.4s, v1.4s -; CHECK-NEXT: scvtf v3.4s, v3.4s +; CHECK-NEXT: scvtf v17.4s, v3.4s ; CHECK-NEXT: scvtf v5.4s, v5.4s ; CHECK-NEXT: scvtf v7.4s, v7.4s -; CHECK-NEXT: ushr v17.4s, v0.4s, #16 +; CHECK-NEXT: movi v21.4s, #127, msl #8 +; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v18.4s, v2.4s, #16 ; CHECK-NEXT: ushr v19.4s, v4.4s, #16 ; CHECK-NEXT: ushr v20.4s, v6.4s, #16 -; CHECK-NEXT: ushr v21.4s, v1.4s, #16 -; CHECK-NEXT: ushr v22.4s, v3.4s, #16 -; CHECK-NEXT: ushr v23.4s, v5.4s, #16 -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v5.4s, #16 +; CHECK-NEXT: ushr v25.4s, v7.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b ; CHECK-NEXT: and v18.16b, v18.16b, v16.16b ; CHECK-NEXT: and v19.16b, v19.16b, v16.16b ; CHECK-NEXT: and v20.16b, v20.16b, v16.16b -; CHECK-NEXT: and v21.16b, v21.16b, v16.16b -; CHECK-NEXT: and v22.16b, v22.16b, v16.16b -; CHECK-NEXT: add v0.4s, v17.4s, v0.4s -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: and v3.16b, v22.16b, v16.16b ; CHECK-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 ; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v19.16b, v23.16b, v16.16b -; CHECK-NEXT: add v20.4s, v22.4s, v3.4s -; CHECK-NEXT: and v16.16b, v17.16b, v16.16b -; CHECK-NEXT: add v17.4s, v21.4s, v1.4s +; CHECK-NEXT: and v18.16b, v23.16b, v16.16b +; CHECK-NEXT: and v19.16b, v24.16b, v16.16b +; CHECK-NEXT: and v16.16b, v25.16b, v16.16b +; CHECK-NEXT: add v20.4s, v3.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s +; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s +; 
CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s +; CHECK-NEXT: add v4.4s, v18.4s, v17.4s ; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v18.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v4.4s, v18.4s -; CHECK-NEXT: add v4.4s, v16.4s, v7.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v17.4s, v18.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v18.4s -; CHECK-NEXT: addhn2 v2.8h, v5.4s, v18.4s -; CHECK-NEXT: addhn2 v3.8h, v4.4s, v18.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s +; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s +; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s +; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i32> %a to <32 x bfloat> @@ -7313,42 +7313,42 @@ define <32 x bfloat> @utofp_v32i32_v32bf16(<32 x i32> %a) { ; CHECK-NEXT: ucvtf v6.4s, v6.4s ; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ucvtf v1.4s, v1.4s -; CHECK-NEXT: ucvtf v3.4s, v3.4s +; CHECK-NEXT: ucvtf v17.4s, v3.4s ; CHECK-NEXT: ucvtf v5.4s, v5.4s ; CHECK-NEXT: ucvtf v7.4s, v7.4s -; CHECK-NEXT: ushr v17.4s, v0.4s, #16 +; CHECK-NEXT: movi v21.4s, #127, msl #8 +; CHECK-NEXT: ushr v3.4s, v0.4s, #16 ; CHECK-NEXT: ushr v18.4s, v2.4s, #16 ; CHECK-NEXT: ushr v19.4s, v4.4s, #16 ; CHECK-NEXT: ushr v20.4s, v6.4s, #16 -; CHECK-NEXT: ushr v21.4s, v1.4s, #16 -; CHECK-NEXT: ushr v22.4s, v3.4s, #16 -; CHECK-NEXT: ushr v23.4s, v5.4s, #16 -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v5.4s, #16 +; CHECK-NEXT: ushr v25.4s, v7.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b ; CHECK-NEXT: and v18.16b, v18.16b, v16.16b ; CHECK-NEXT: and v19.16b, v19.16b, v16.16b ; CHECK-NEXT: and v20.16b, v20.16b, v16.16b -; CHECK-NEXT: and v21.16b, v21.16b, v16.16b -; CHECK-NEXT: and v22.16b, v22.16b, v16.16b -; CHECK-NEXT: add v0.4s, v17.4s, v0.4s -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 +; CHECK-NEXT: add v0.4s, v3.4s, v0.4s +; CHECK-NEXT: and v3.16b, v22.16b, v16.16b ; CHECK-NEXT: add v2.4s, v18.4s, v2.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 ; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v6.4s, v20.4s, v6.4s -; CHECK-NEXT: and v19.16b, v23.16b, v16.16b -; CHECK-NEXT: add v20.4s, v22.4s, v3.4s -; CHECK-NEXT: and v16.16b, v17.16b, v16.16b -; CHECK-NEXT: add v17.4s, v21.4s, v1.4s +; CHECK-NEXT: and v18.16b, v23.16b, v16.16b +; CHECK-NEXT: and v19.16b, v24.16b, v16.16b +; CHECK-NEXT: and v16.16b, v25.16b, v16.16b +; CHECK-NEXT: add v20.4s, v3.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v2.4s, v21.4s +; CHECK-NEXT: addhn v2.4h, v4.4s, v21.4s +; CHECK-NEXT: addhn v3.4h, v6.4s, v21.4s +; CHECK-NEXT: add v4.4s, v18.4s, v17.4s ; CHECK-NEXT: add v5.4s, v19.4s, v5.4s -; CHECK-NEXT: addhn v0.4h, v0.4s, v18.4s -; CHECK-NEXT: addhn v1.4h, v2.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v4.4s, v18.4s -; CHECK-NEXT: add v4.4s, v16.4s, v7.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v17.4s, v18.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v18.4s -; CHECK-NEXT: addhn2 v2.8h, v5.4s, v18.4s -; CHECK-NEXT: addhn2 v3.8h, v4.4s, v18.4s +; CHECK-NEXT: add v6.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v20.4s, v21.4s +; CHECK-NEXT: addhn2 v1.8h, v4.4s, v21.4s +; CHECK-NEXT: addhn2 v2.8h, v5.4s, v21.4s +; CHECK-NEXT: addhn2 v3.8h, v6.4s, v21.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i32> %a to <32 x bfloat> @@ -7364,9 +7364,9 @@ define <2 x bfloat> 
@stofp_v2i16_v2bf16(<2 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <2 x i16> %a to <2 x bfloat> @@ -7382,9 +7382,9 @@ define <2 x bfloat> @utofp_v2i16_v2bf16(<2 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <2 x i16> %a to <2 x bfloat> @@ -7399,9 +7399,9 @@ define <3 x bfloat> @stofp_v3i16_v3bf16(<3 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i16> %a to <3 x bfloat> @@ -7416,9 +7416,9 @@ define <3 x bfloat> @utofp_v3i16_v3bf16(<3 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i16> %a to <3 x bfloat> @@ -7433,9 +7433,9 @@ define <4 x bfloat> @stofp_v4i16_v4bf16(<4 x i16> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i16> %a to <4 x bfloat> @@ -7450,9 +7450,9 @@ define <4 x bfloat> @utofp_v4i16_v4bf16(<4 x i16> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i16> %a to <4 x bfloat> @@ -7513,27 +7513,27 @@ define <16 x bfloat> @stofp_v16i16_v16bf16(<16 x i16> %a) { ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v7.4s, #127, msl #8 ; CHECK-NEXT: scvtf v3.4s, v3.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s -; CHECK-NEXT: scvtf v6.4s, v0.4s -; CHECK-NEXT: scvtf v7.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-NEXT: ushr v0.4s, v4.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v2.16b +; CHECK-NEXT: scvtf v5.4s, v0.4s +; CHECK-NEXT: scvtf v6.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v16.4s, v5.4s, #16 +; CHECK-NEXT: ushr v17.4s, v6.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; 
CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v16.16b, v16.16b, v2.16b ; CHECK-NEXT: and v2.16b, v17.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: add v18.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v1.4s -; CHECK-NEXT: addhn v1.4h, v4.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v3.4s -; CHECK-NEXT: addhn2 v1.8h, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s +; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i16> %a to <16 x bfloat> @@ -7548,27 +7548,27 @@ define <16 x bfloat> @utofp_v16i16_v16bf16(<16 x i16> %a) { ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: movi v7.4s, #127, msl #8 ; CHECK-NEXT: ucvtf v3.4s, v3.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s -; CHECK-NEXT: ucvtf v6.4s, v0.4s -; CHECK-NEXT: ucvtf v7.4s, v1.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: ushr v5.4s, v3.4s, #16 -; CHECK-NEXT: ushr v0.4s, v4.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v7.4s, #16 -; CHECK-NEXT: and v5.16b, v5.16b, v2.16b +; CHECK-NEXT: ucvtf v5.4s, v0.4s +; CHECK-NEXT: ucvtf v6.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v3.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v16.4s, v5.4s, #16 +; CHECK-NEXT: ushr v17.4s, v6.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b ; CHECK-NEXT: and v16.16b, v16.16b, v2.16b ; CHECK-NEXT: and v2.16b, v17.16b, v2.16b -; CHECK-NEXT: add v5.4s, v5.4s, v1.4s -; CHECK-NEXT: add v18.4s, v0.4s, v1.4s -; CHECK-NEXT: add v2.4s, v2.4s, v1.4s -; CHECK-NEXT: addhn v0.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v1.4s -; CHECK-NEXT: addhn v1.4h, v4.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v3.4s -; CHECK-NEXT: addhn2 v1.8h, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v0.4s, v7.4s +; CHECK-NEXT: add v1.4s, v1.4s, v7.4s +; CHECK-NEXT: add v2.4s, v2.4s, v7.4s +; CHECK-NEXT: addhn v0.4h, v3.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: addhn2 v0.8h, v5.4s, v3.4s +; CHECK-NEXT: addhn2 v1.8h, v6.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i16> %a to <16 x bfloat> @@ -7578,56 +7578,56 @@ entry: define <32 x bfloat> @stofp_v32i16_v32bf16(<32 x i16> %a) { ; CHECK-LABEL: stofp_v32i16_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sshll v5.4s, v1.4h, #0 -; CHECK-NEXT: sshll v4.4s, v0.4h, #0 +; CHECK-NEXT: sshll v4.4s, v1.4h, #0 +; CHECK-NEXT: sshll v5.4s, v0.4h, #0 ; CHECK-NEXT: sshll v6.4s, v2.4h, #0 ; CHECK-NEXT: sshll v7.4s, v3.4h, #0 ; CHECK-NEXT: sshll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0 ; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: scvtf v5.4s, v5.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s ; CHECK-NEXT: scvtf v6.4s, v6.4s ; CHECK-NEXT: scvtf v7.4s, v7.4s -; CHECK-NEXT: scvtf v19.4s, v0.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 -; CHECK-NEXT: scvtf v20.4s, v1.4s -; CHECK-NEXT: scvtf v21.4s, v2.4s -; CHECK-NEXT: scvtf v22.4s, v3.4s +; CHECK-NEXT: scvtf v17.4s, v0.4s +; CHECK-NEXT: scvtf v18.4s, v1.4s +; CHECK-NEXT: scvtf v19.4s, 
v2.4s +; CHECK-NEXT: scvtf v20.4s, v3.4s +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushr v0.4s, v5.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: ushr v1.4s, v6.4s, #16 -; CHECK-NEXT: ushr v2.4s, v7.4s, #16 -; CHECK-NEXT: ushr v23.4s, v20.4s, #16 -; CHECK-NEXT: ushr v25.4s, v22.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v2.4s, v6.4s, #16 +; CHECK-NEXT: ushr v3.4s, v7.4s, #16 +; CHECK-NEXT: ushr v22.4s, v17.4s, #16 +; CHECK-NEXT: ushr v23.4s, v18.4s, #16 +; CHECK-NEXT: ushr v24.4s, v19.4s, #16 +; CHECK-NEXT: ushr v25.4s, v20.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v16.16b -; CHECK-NEXT: and v3.16b, v17.16b, v16.16b ; CHECK-NEXT: and v1.16b, v1.16b, v16.16b ; CHECK-NEXT: and v2.16b, v2.16b, v16.16b -; CHECK-NEXT: ushr v17.4s, v19.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-NEXT: and v22.16b, v22.16b, v16.16b ; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: add v24.4s, v0.4s, v18.4s -; CHECK-NEXT: ushr v0.4s, v21.4s, #16 -; CHECK-NEXT: add v3.4s, v3.4s, v18.4s -; CHECK-NEXT: add v26.4s, v1.4s, v18.4s -; CHECK-NEXT: add v27.4s, v2.4s, v18.4s -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b -; CHECK-NEXT: and v28.16b, v0.16b, v16.16b +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b ; CHECK-NEXT: and v16.16b, v25.16b, v16.16b -; CHECK-NEXT: addhn v0.4h, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v24.4s -; CHECK-NEXT: add v4.4s, v17.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v6.4s, v26.4s -; CHECK-NEXT: add v5.4s, v23.4s, v18.4s -; CHECK-NEXT: addhn v3.4h, v7.4s, v27.4s -; CHECK-NEXT: add v6.4s, v28.4s, v18.4s -; CHECK-NEXT: add v16.4s, v16.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v4.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v5.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v6.4s -; CHECK-NEXT: addhn2 v3.8h, v22.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v22.4s, v21.4s +; CHECK-NEXT: add v5.4s, v23.4s, v21.4s +; CHECK-NEXT: add v6.4s, v24.4s, v21.4s +; CHECK-NEXT: add v7.4s, v16.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s +; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s +; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i16> %a to <32 x bfloat> @@ -7637,56 +7637,56 @@ entry: define <32 x bfloat> @utofp_v32i16_v32bf16(<32 x i16> %a) { ; CHECK-LABEL: utofp_v32i16_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-NEXT: ushll v4.4s, v0.4h, #0 +; CHECK-NEXT: ushll v4.4s, v1.4h, #0 +; CHECK-NEXT: ushll v5.4s, v0.4h, #0 ; CHECK-NEXT: ushll v6.4s, v2.4h, #0 ; CHECK-NEXT: ushll v7.4s, v3.4h, #0 ; CHECK-NEXT: ushll2 v0.4s, v0.8h, #0 -; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 +; CHECK-NEXT: movi v16.4s, #1 ; CHECK-NEXT: ucvtf v5.4s, v5.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s ; CHECK-NEXT: ucvtf v6.4s, v6.4s ; CHECK-NEXT: ucvtf v7.4s, v7.4s -; CHECK-NEXT: ucvtf v19.4s, v0.4s -; CHECK-NEXT: movi v18.4s, #127, msl #8 -; CHECK-NEXT: ucvtf v20.4s, v1.4s -; CHECK-NEXT: ucvtf v21.4s, v2.4s -; CHECK-NEXT: ucvtf v22.4s, v3.4s +; CHECK-NEXT: ucvtf v17.4s, v0.4s +; CHECK-NEXT: ucvtf v18.4s, v1.4s +; CHECK-NEXT: ucvtf v19.4s, 
v2.4s +; CHECK-NEXT: ucvtf v20.4s, v3.4s +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushr v0.4s, v5.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: ushr v1.4s, v6.4s, #16 -; CHECK-NEXT: ushr v2.4s, v7.4s, #16 -; CHECK-NEXT: ushr v23.4s, v20.4s, #16 -; CHECK-NEXT: ushr v25.4s, v22.4s, #16 +; CHECK-NEXT: ushr v1.4s, v4.4s, #16 +; CHECK-NEXT: ushr v2.4s, v6.4s, #16 +; CHECK-NEXT: ushr v3.4s, v7.4s, #16 +; CHECK-NEXT: ushr v22.4s, v17.4s, #16 +; CHECK-NEXT: ushr v23.4s, v18.4s, #16 +; CHECK-NEXT: ushr v24.4s, v19.4s, #16 +; CHECK-NEXT: ushr v25.4s, v20.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v16.16b -; CHECK-NEXT: and v3.16b, v17.16b, v16.16b ; CHECK-NEXT: and v1.16b, v1.16b, v16.16b ; CHECK-NEXT: and v2.16b, v2.16b, v16.16b -; CHECK-NEXT: ushr v17.4s, v19.4s, #16 +; CHECK-NEXT: and v3.16b, v3.16b, v16.16b +; CHECK-NEXT: and v22.16b, v22.16b, v16.16b ; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: add v24.4s, v0.4s, v18.4s -; CHECK-NEXT: ushr v0.4s, v21.4s, #16 -; CHECK-NEXT: add v3.4s, v3.4s, v18.4s -; CHECK-NEXT: add v26.4s, v1.4s, v18.4s -; CHECK-NEXT: add v27.4s, v2.4s, v18.4s -; CHECK-NEXT: and v17.16b, v17.16b, v16.16b -; CHECK-NEXT: and v28.16b, v0.16b, v16.16b +; CHECK-NEXT: and v24.16b, v24.16b, v16.16b ; CHECK-NEXT: and v16.16b, v25.16b, v16.16b -; CHECK-NEXT: addhn v0.4h, v4.4s, v3.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v24.4s -; CHECK-NEXT: add v4.4s, v17.4s, v18.4s -; CHECK-NEXT: addhn v2.4h, v6.4s, v26.4s -; CHECK-NEXT: add v5.4s, v23.4s, v18.4s -; CHECK-NEXT: addhn v3.4h, v7.4s, v27.4s -; CHECK-NEXT: add v6.4s, v28.4s, v18.4s -; CHECK-NEXT: add v16.4s, v16.4s, v18.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v4.4s -; CHECK-NEXT: addhn2 v1.8h, v20.4s, v5.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v6.4s -; CHECK-NEXT: addhn2 v3.8h, v22.4s, v16.4s +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v1.4s, v1.4s, v21.4s +; CHECK-NEXT: add v2.4s, v2.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: addhn v0.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v1.4h, v4.4s, v1.4s +; CHECK-NEXT: addhn v2.4h, v6.4s, v2.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v3.4s +; CHECK-NEXT: add v4.4s, v22.4s, v21.4s +; CHECK-NEXT: add v5.4s, v23.4s, v21.4s +; CHECK-NEXT: add v6.4s, v24.4s, v21.4s +; CHECK-NEXT: add v7.4s, v16.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v17.4s, v4.4s +; CHECK-NEXT: addhn2 v1.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v6.4s +; CHECK-NEXT: addhn2 v3.8h, v20.4s, v7.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i16> %a to <32 x bfloat> @@ -7768,9 +7768,9 @@ define <3 x bfloat> @stofp_v3i8_v3bf16(<3 x i8> %a) { ; CHECK-NEXT: scvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <3 x i8> %a to <3 x bfloat> @@ -7789,9 +7789,9 @@ define <3 x bfloat> @utofp_v3i8_v3bf16(<3 x i8> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <3 x i8> %a to <3 x bfloat> @@ -7808,9 +7808,9 @@ define <4 x bfloat> @stofp_v4i8_v4bf16(<4 x i8> %a) { ; CHECK-NEXT: scvtf v0.4s, 
v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = sitofp <4 x i8> %a to <4 x bfloat> @@ -7826,9 +7826,9 @@ define <4 x bfloat> @utofp_v4i8_v4bf16(<4 x i8> %a) { ; CHECK-NEXT: ucvtf v0.4s, v0.4s ; CHECK-NEXT: ushr v2.4s, v0.4s, #16 ; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #127, msl #8 ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s -; CHECK-NEXT: movi v1.4s, #127, msl #8 -; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn v0.4h, v0.4s, v2.4s ; CHECK-NEXT: ret entry: %c = uitofp <4 x i8> %a to <4 x bfloat> @@ -7909,11 +7909,11 @@ define <16 x bfloat> @stofp_v16i8_v16bf16(<16 x i8> %a) { ; CHECK-NEXT: add v5.4s, v5.4s, v7.4s ; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: add v5.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: add v4.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s ; CHECK-NEXT: ret entry: %c = sitofp <16 x i8> %a to <16 x bfloat> @@ -7946,11 +7946,11 @@ define <16 x bfloat> @utofp_v16i8_v16bf16(<16 x i8> %a) { ; CHECK-NEXT: add v5.4s, v5.4s, v7.4s ; CHECK-NEXT: add v0.4s, v0.4s, v7.4s ; CHECK-NEXT: addhn v1.4h, v3.4s, v5.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: add v5.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn v0.4h, v4.4s, v0.4s +; CHECK-NEXT: add v3.4s, v16.4s, v7.4s +; CHECK-NEXT: add v4.4s, v17.4s, v7.4s ; CHECK-NEXT: addhn2 v1.8h, v2.4s, v3.4s -; CHECK-NEXT: addhn2 v0.8h, v6.4s, v5.4s +; CHECK-NEXT: addhn2 v0.8h, v6.4s, v4.4s ; CHECK-NEXT: ret entry: %c = uitofp <16 x i8> %a to <16 x bfloat> @@ -7961,14 +7961,14 @@ define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-LABEL: stofp_v32i8_v32bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0 -; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: movi v20.4s, #127, msl #8 +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: sshll v5.4s, v3.4h, #0 -; CHECK-NEXT: sshll v6.4s, v4.4h, #0 -; CHECK-NEXT: sshll v7.4s, v0.4h, #0 +; CHECK-NEXT: sshll v6.4s, v0.4h, #0 +; CHECK-NEXT: sshll v7.4s, v4.4h, #0 ; CHECK-NEXT: sshll v16.4s, v1.4h, #0 ; CHECK-NEXT: sshll2 v3.4s, v3.8h, #0 ; CHECK-NEXT: sshll2 v4.4s, v4.8h, #0 @@ -7980,40 +7980,40 @@ define <32 x bfloat> @stofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-NEXT: scvtf v16.4s, v16.4s ; CHECK-NEXT: scvtf v17.4s, v3.4s ; CHECK-NEXT: scvtf v4.4s, v4.4s -; CHECK-NEXT: scvtf v19.4s, v0.4s -; CHECK-NEXT: scvtf v21.4s, v1.4s -; CHECK-NEXT: ushr v3.4s, v5.4s, #16 -; CHECK-NEXT: ushr v18.4s, v6.4s, #16 -; CHECK-NEXT: ushr v0.4s, v7.4s, #16 -; CHECK-NEXT: ushr v1.4s, v16.4s, #16 -; CHECK-NEXT: ushr v22.4s, v17.4s, #16 -; CHECK-NEXT: ushr v23.4s, v4.4s, #16 -; CHECK-NEXT: ushr v24.4s, v19.4s, #16 -; CHECK-NEXT: ushr v25.4s, v21.4s, #16 -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: scvtf v18.4s, v0.4s +; CHECK-NEXT: scvtf v19.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v5.4s, #16 +; CHECK-NEXT: ushr v3.4s, v6.4s, #16 +; CHECK-NEXT: ushr 
v1.4s, v7.4s, #16 +; CHECK-NEXT: ushr v20.4s, v16.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v4.4s, #16 +; CHECK-NEXT: ushr v22.4s, v18.4s, #16 +; CHECK-NEXT: ushr v25.4s, v19.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: and v23.16b, v23.16b, v2.16b ; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v2.16b, v25.16b, v2.16b -; CHECK-NEXT: add v3.4s, v3.4s, v20.4s -; CHECK-NEXT: add v18.4s, v18.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: add v26.4s, v1.4s, v20.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v3.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn v0.4h, v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v22.4s, v20.4s -; CHECK-NEXT: add v6.4s, v24.4s, v20.4s -; CHECK-NEXT: add v7.4s, v23.4s, v20.4s -; CHECK-NEXT: add v18.4s, v2.4s, v20.4s -; CHECK-NEXT: addhn v2.4h, v16.4s, v26.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v17.4s, v5.4s +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v25.16b, v25.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v26.4s, v1.4s, v21.4s +; CHECK-NEXT: add v20.4s, v20.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s +; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s +; CHECK-NEXT: add v5.4s, v22.4s, v21.4s +; CHECK-NEXT: add v6.4s, v23.4s, v21.4s +; CHECK-NEXT: add v7.4s, v24.4s, v21.4s +; CHECK-NEXT: add v16.4s, v25.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s ; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v18.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s ; CHECK-NEXT: ret entry: %c = sitofp <32 x i8> %a to <32 x bfloat> @@ -8024,14 +8024,14 @@ define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-LABEL: utofp_v32i8_v32bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 -; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: movi v2.4s, #1 -; CHECK-NEXT: movi v20.4s, #127, msl #8 +; CHECK-NEXT: movi v21.4s, #127, msl #8 ; CHECK-NEXT: ushll v5.4s, v3.4h, #0 -; CHECK-NEXT: ushll v6.4s, v4.4h, #0 -; CHECK-NEXT: ushll v7.4s, v0.4h, #0 +; CHECK-NEXT: ushll v6.4s, v0.4h, #0 +; CHECK-NEXT: ushll v7.4s, v4.4h, #0 ; CHECK-NEXT: ushll v16.4s, v1.4h, #0 ; CHECK-NEXT: ushll2 v3.4s, v3.8h, #0 ; CHECK-NEXT: ushll2 v4.4s, v4.8h, #0 @@ -8043,40 +8043,40 @@ define <32 x bfloat> @utofp_v32i8_v32bf16(<32 x i8> %a) { ; CHECK-NEXT: ucvtf v16.4s, v16.4s ; CHECK-NEXT: ucvtf v17.4s, v3.4s ; CHECK-NEXT: ucvtf v4.4s, v4.4s -; CHECK-NEXT: ucvtf v19.4s, v0.4s -; CHECK-NEXT: ucvtf v21.4s, v1.4s -; CHECK-NEXT: ushr v3.4s, v5.4s, #16 -; CHECK-NEXT: ushr v18.4s, v6.4s, #16 -; CHECK-NEXT: ushr v0.4s, v7.4s, #16 -; CHECK-NEXT: ushr v1.4s, v16.4s, #16 -; CHECK-NEXT: ushr v22.4s, v17.4s, #16 -; CHECK-NEXT: ushr v23.4s, v4.4s, #16 -; CHECK-NEXT: ushr v24.4s, v19.4s, #16 -; CHECK-NEXT: ushr v25.4s, v21.4s, #16 -; CHECK-NEXT: and v3.16b, v3.16b, v2.16b -; CHECK-NEXT: and v18.16b, v18.16b, v2.16b +; CHECK-NEXT: ucvtf v18.4s, v0.4s +; CHECK-NEXT: ucvtf v19.4s, v1.4s +; CHECK-NEXT: ushr v0.4s, v5.4s, #16 +; CHECK-NEXT: ushr v3.4s, v6.4s, 
#16 +; CHECK-NEXT: ushr v1.4s, v7.4s, #16 +; CHECK-NEXT: ushr v20.4s, v16.4s, #16 +; CHECK-NEXT: ushr v23.4s, v17.4s, #16 +; CHECK-NEXT: ushr v24.4s, v4.4s, #16 +; CHECK-NEXT: ushr v22.4s, v18.4s, #16 +; CHECK-NEXT: ushr v25.4s, v19.4s, #16 ; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v3.16b, v3.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v20.16b, v20.16b, v2.16b ; CHECK-NEXT: and v23.16b, v23.16b, v2.16b ; CHECK-NEXT: and v24.16b, v24.16b, v2.16b -; CHECK-NEXT: and v2.16b, v25.16b, v2.16b -; CHECK-NEXT: add v3.4s, v3.4s, v20.4s -; CHECK-NEXT: add v18.4s, v18.4s, v20.4s -; CHECK-NEXT: add v0.4s, v0.4s, v20.4s -; CHECK-NEXT: add v26.4s, v1.4s, v20.4s -; CHECK-NEXT: addhn v1.4h, v5.4s, v3.4s -; CHECK-NEXT: addhn v3.4h, v6.4s, v18.4s -; CHECK-NEXT: addhn v0.4h, v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v22.4s, v20.4s -; CHECK-NEXT: add v6.4s, v24.4s, v20.4s -; CHECK-NEXT: add v7.4s, v23.4s, v20.4s -; CHECK-NEXT: add v18.4s, v2.4s, v20.4s -; CHECK-NEXT: addhn v2.4h, v16.4s, v26.4s -; CHECK-NEXT: addhn2 v0.8h, v19.4s, v6.4s -; CHECK-NEXT: addhn2 v1.8h, v17.4s, v5.4s +; CHECK-NEXT: and v22.16b, v22.16b, v2.16b +; CHECK-NEXT: and v25.16b, v25.16b, v2.16b +; CHECK-NEXT: add v0.4s, v0.4s, v21.4s +; CHECK-NEXT: add v3.4s, v3.4s, v21.4s +; CHECK-NEXT: add v26.4s, v1.4s, v21.4s +; CHECK-NEXT: add v20.4s, v20.4s, v21.4s +; CHECK-NEXT: addhn v1.4h, v5.4s, v0.4s +; CHECK-NEXT: addhn v0.4h, v6.4s, v3.4s +; CHECK-NEXT: addhn v3.4h, v7.4s, v26.4s +; CHECK-NEXT: addhn v2.4h, v16.4s, v20.4s +; CHECK-NEXT: add v5.4s, v22.4s, v21.4s +; CHECK-NEXT: add v6.4s, v23.4s, v21.4s +; CHECK-NEXT: add v7.4s, v24.4s, v21.4s +; CHECK-NEXT: add v16.4s, v25.4s, v21.4s +; CHECK-NEXT: addhn2 v0.8h, v18.4s, v5.4s +; CHECK-NEXT: addhn2 v1.8h, v17.4s, v6.4s ; CHECK-NEXT: addhn2 v3.8h, v4.4s, v7.4s -; CHECK-NEXT: addhn2 v2.8h, v21.4s, v18.4s +; CHECK-NEXT: addhn2 v2.8h, v19.4s, v16.4s ; CHECK-NEXT: ret entry: %c = uitofp <32 x i8> %a to <32 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/ldexp.ll b/llvm/test/CodeGen/AArch64/ldexp.ll index 4b491051a88aa..ba04ba1d7bb6a 100644 --- a/llvm/test/CodeGen/AArch64/ldexp.ll +++ b/llvm/test/CodeGen/AArch64/ldexp.ll @@ -4,9 +4,9 @@ define double @testExp(double %val, i32 %a) { ; CHECK-LABEL: testExp: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: sxtw x8, w0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: fscale z0.d, p0/m, z0.d, z1.d @@ -22,8 +22,8 @@ declare double @ldexp(double, i32) memory(none) define float @testExpf(float %val, i32 %a) { ; CHECK-LABEL: testExpf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 ; CHECK-NEXT: fscale z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -49,9 +49,9 @@ declare fp128 @ldexpl(fp128, i32) memory(none) define half @testExpf16(half %val, i32 %a) { ; CHECK-LABEL: testExpf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcvt s0, h0 ; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fscale z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fcvt h0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll index 1adbbab76abf5..7e28c863b07a9 100644 --- a/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll +++ 
b/llvm/test/CodeGen/AArch64/llrint-conv-fp16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16
+; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s --check-prefixes=CHECK-NOFP16
+; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel | FileCheck %s --check-prefixes=CHECK-FP16

 define i16 @testmhhs(half %x) {
 ; CHECK-NOFP16-LABEL: testmhhs:
diff --git a/llvm/test/CodeGen/AArch64/llrint-conv.ll b/llvm/test/CodeGen/AArch64/llrint-conv.ll
index fa11b007eeb3d..3a6396d120f79 100644
--- a/llvm/test/CodeGen/AArch64/llrint-conv.ll
+++ b/llvm/test/CodeGen/AArch64/llrint-conv.ll
@@ -1,59 +1,75 @@
-; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -global-isel | FileCheck %s

-; CHECK-LABEL: testmsws:
-; CHECK: frintx [[REG:s[0-9]]], s0
-; CHECK-NEXT: fcvtzs x0, [[REG]]
-; CHECK: ret
 define i32 @testmsws(float %x) {
+; CHECK-LABEL: testmsws:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintx s0, s0
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i64 @llvm.llrint.f32(float %x)
   %conv = trunc i64 %0 to i32
   ret i32 %conv
 }

-; CHECK-LABEL: testmsxs:
-; CHECK: frintx [[REG:s[0-9]]], s0
-; CHECK-NEXT: fcvtzs x0, [[REG]]
-; CHECK-NEXT: ret
 define i64 @testmsxs(float %x) {
+; CHECK-LABEL: testmsxs:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintx s0, s0
+; CHECK-NEXT: fcvtzs x0, s0
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i64 @llvm.llrint.f32(float %x)
   ret i64 %0
 }

-; CHECK-LABEL: testmswd:
-; CHECK: frintx [[REG:d[0-9]]], d0
-; CHECK-NEXT: fcvtzs x0, [[REG]]
-; CHECK: ret
 define i32 @testmswd(double %x) {
+; CHECK-LABEL: testmswd:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintx d0, d0
+; CHECK-NEXT: fcvtzs x0, d0
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i64 @llvm.llrint.f64(double %x)
   %conv = trunc i64 %0 to i32
   ret i32 %conv
 }

-; CHECK-LABEL: testmsxd:
-; CHECK: frintx [[REG:d[0-9]]], d0
-; CHECK-NEXT: fcvtzs x0, [[REG]]
-; CHECK-nEXT: ret
 define i64 @testmsxd(double %x) {
+; CHECK-LABEL: testmsxd:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: frintx d0, d0
+; CHECK-NEXT: fcvtzs x0, d0
+; CHECK-NEXT: ret
 entry:
   %0 = tail call i64 @llvm.llrint.f64(double %x)
   ret i64 %0
 }

-; CHECK-LABEL: testmswl:
-; CHECK: bl llrintl
 define i32 @testmswl(fp128 %x) {
+; CHECK-LABEL: testmswl:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl llrintl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.llrint.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b llrintl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b llrintl entry: %0 = tail call i64 @llvm.llrint.f128(fp128 %x) ret i64 %0 diff --git a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll index ab15bf564ec42..59a460923e8b7 100644 --- a/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll +++ b/llvm/test/CodeGen/AArch64/llvm-ir-to-intrinsic.ll @@ -10,9 +10,9 @@ define @sdiv_i8( %a, %b) ; CHECK: // %bb.0: ; CHECK-NEXT: sunpkhi z2.h, z1.b ; CHECK-NEXT: sunpkhi z3.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: sunpklo z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z4.s, z2.h ; CHECK-NEXT: sunpkhi z5.s, z3.h ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -36,11 +36,11 @@ define @sdiv_i8( %a, %b) define @sdiv_i16( %a, %b) { ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h @@ -140,9 +140,9 @@ define @srem_i8( %a, %b) define @srem_i16( %a, %b) { ; CHECK-LABEL: srem_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z2.s, z1.h ; CHECK-NEXT: sunpkhi z3.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z4.s, z0.h ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: sunpklo z3.s, z1.h @@ -188,9 +188,9 @@ define @udiv_i8( %a, %b) ; CHECK: // %bb.0: ; CHECK-NEXT: uunpkhi z2.h, z1.b ; CHECK-NEXT: uunpkhi z3.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z4.s, z2.h ; CHECK-NEXT: uunpkhi z5.s, z3.h ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -214,11 +214,11 @@ define @udiv_i8( %a, %b) define @udiv_i16( %a, %b) { ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h @@ -261,9 +261,9 @@ define @udiv_split_i32( %a, @udiv_widen_i32( %a, %b) { ; CHECK-LABEL: udiv_widen_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: udiv z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %div = udiv %a, %b @@ -319,9 +319,9 @@ define @urem_i8( %a, %b) define @urem_i16( %a, %b) { ; CHECK-LABEL: urem_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z2.s, z1.h ; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z4.s, z0.h ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: uunpklo z3.s, z1.h @@ -558,9 +558,9 @@ define @umin_split_i64( %a, @umin_promote_i8( %a, %b) { ; CHECK-LABEL: umin_promote_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: 
ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %cmp = icmp ult %a, %b @@ -704,9 +704,9 @@ define @umax_split_i16( %a, @umax_promote_i32( %a, %b) { ; CHECK-LABEL: umax_promote_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %cmp = icmp ugt %a, %b @@ -883,8 +883,8 @@ define @lsl_split_i64( %a, @lsl_promote_i16( %a, %b){ ; CHECK-LABEL: lsl_promote_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %shl = shl %a, %b @@ -982,9 +982,9 @@ define @lsr_i64( %a, %b) define @lsr_promote_i8( %a, %b){ ; CHECK-LABEL: lsr_promote_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %shr = lshr %a, %b @@ -1081,10 +1081,10 @@ declare @llvm.fshr.nxv2i64(, @fshl_i64( %a, %b, %c){ ; CHECK-LABEL: fshl_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, #63 // =0x3f ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: lsr z1.d, z1.d, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: bic z2.d, z3.d, z2.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d @@ -1098,17 +1098,16 @@ define @fshl_i64( %a, %b define @fshl_illegal_i64( %a, %b, %c){ ; CHECK-LABEL: fshl_illegal_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z6.d, #63 // =0x3f -; CHECK-NEXT: mov z7.d, z4.d ; CHECK-NEXT: lsr z2.d, z2.d, #1 ; CHECK-NEXT: lsr z3.d, z3.d, #1 -; CHECK-NEXT: bic z4.d, z6.d, z4.d -; CHECK-NEXT: and z7.d, z7.d, #0x3f +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: bic z7.d, z6.d, z4.d +; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: bic z6.d, z6.d, z5.d ; CHECK-NEXT: and z5.d, z5.d, #0x3f -; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z7.d -; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z4.d +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z4.d +; CHECK-NEXT: lsr z2.d, p0/m, z2.d, z7.d ; CHECK-NEXT: lsr z3.d, p0/m, z3.d, z6.d ; CHECK-NEXT: lsl z1.d, p0/m, z1.d, z5.d ; CHECK-NEXT: orr z0.d, z0.d, z2.d @@ -1121,9 +1120,9 @@ define @fshl_illegal_i64( %a, @fshl_rot_i64( %a, %b){ ; CHECK-LABEL: fshl_rot_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: lslr z2.d, p0/m, z2.d, z0.d @@ -1138,11 +1137,11 @@ define @fshl_rot_i64( %a, @fshl_rot_illegal_i64( %a, %b){ ; CHECK-LABEL: fshl_rot_illegal_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: subr z2.d, z2.d, #0 // =0x0 ; CHECK-NEXT: mov z5.d, z3.d ; CHECK-NEXT: subr z3.d, z3.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z5.d, z5.d, #0x3f @@ -1175,10 +1174,10 @@ define @fshl_rot_const_i64( %a){ define @fshr_i64( %a, %b, %c){ ; CHECK-LABEL: fshr_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z3.d, #63 // =0x3f ; CHECK-NEXT: mov z4.d, z2.d ; CHECK-NEXT: lsl z0.d, z0.d, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: bic z2.d, z3.d, z2.d ; CHECK-NEXT: and z4.d, z4.d, #0x3f ; CHECK-NEXT: lsr z1.d, 
p0/m, z1.d, z4.d @@ -1192,9 +1191,9 @@ define @fshr_i64( %a, %b define @fshr_rot_i64( %a, %b){ ; CHECK-LABEL: fshr_rot_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: subr z1.d, z1.d, #0 // =0x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z2.d, z2.d, #0x3f ; CHECK-NEXT: and z1.d, z1.d, #0x3f ; CHECK-NEXT: lsrr z2.d, p0/m, z2.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/load-insert-zero.ll b/llvm/test/CodeGen/AArch64/load-insert-zero.ll index 993af08a66ddd..23d545459295f 100644 --- a/llvm/test/CodeGen/AArch64/load-insert-zero.ll +++ b/llvm/test/CodeGen/AArch64/load-insert-zero.ll @@ -469,18 +469,18 @@ define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef ; CHECK-NEXT: lsr w8, w8, #24 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b ; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b -; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: add v0.8h, v0.8h, v3.8h ; CHECK-NEXT: dup v3.8b, w8 +; CHECK-NEXT: str s1, [x0] ; CHECK-NEXT: lsl x8, x1, #1 ; CHECK-NEXT: rshrn v0.8b, v0.8h, #2 ; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s ; CHECK-NEXT: str s0, [x0, x1] ; CHECK-NEXT: zip1 v3.2s, v0.2s, v3.2s ; CHECK-NEXT: ext v2.8b, v2.8b, v0.8b, #1 +; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1 ; CHECK-NEXT: str s2, [x0, x8] ; CHECK-NEXT: add x8, x8, x1 -; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1 ; CHECK-NEXT: str s1, [x0, x8] ; CHECK-NEXT: ret %5 = load i32, ptr %2, align 4 @@ -608,9 +608,9 @@ define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noun define @loadnxv8i8(ptr %p) { ; CHECK-LABEL: loadnxv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %l = load i8, ptr %p @@ -631,9 +631,9 @@ define @loadnxv16i8(ptr %p) { define @loadnxv4i16(ptr %p) { ; CHECK-LABEL: loadnxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %l = load i16, ptr %p @@ -654,9 +654,9 @@ define @loadnxv8i16(ptr %p) { define @loadnxv2i32(ptr %p) { ; CHECK-LABEL: loadnxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %l = load i32, ptr %p @@ -688,9 +688,9 @@ define @loadnxv2i64(ptr %p) { define @loadnxv4f16(ptr %p) { ; CHECK-LABEL: loadnxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -715,9 +715,9 @@ define @loadnxv8f16(ptr %p) { define @loadnxv4bf16(ptr %p) { ; CHECK-LABEL: loadnxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -742,9 +742,9 @@ define @loadnxv8bf16(ptr %p) { define @loadnxv2f32(ptr %p) { ; CHECK-LABEL: loadnxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.s, #0 // =0x0 @@ -782,9 +782,9 @@ define @loadnxv2f64(ptr %p) { define @loadnxv8i8_offset(ptr %p) { ; 
CHECK-LABEL: loadnxv8i8_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, #0 // =0x0 ; CHECK-NEXT: ldrb w8, [x0, #1] +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, w8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -807,9 +807,9 @@ define @loadnxv16i8_offset(ptr %p) { define @loadnxv4i16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4i16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, #0 // =0x0 ; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, w8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -832,9 +832,9 @@ define @loadnxv8i16_offset(ptr %p) { define @loadnxv2i32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2i32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: ldur w8, [x0, #1] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %g = getelementptr inbounds i8, ptr %p, i64 1 @@ -869,9 +869,9 @@ define @loadnxv2i64_offset(ptr %p) { define @loadnxv4f16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4f16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -898,9 +898,9 @@ define @loadnxv8f16_offset(ptr %p) { define @loadnxv4bf16_offset(ptr %p) { ; CHECK-LABEL: loadnxv4bf16_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.h, #0 // =0x0 @@ -927,9 +927,9 @@ define @loadnxv8bf16_offset(ptr %p) { define @loadnxv2f32_offset(ptr %p) { ; CHECK-LABEL: loadnxv2f32_offset: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.s, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/logic-shift.ll b/llvm/test/CodeGen/AArch64/logic-shift.ll index 39f82dd4593fb..31047954401cf 100644 --- a/llvm/test/CodeGen/AArch64/logic-shift.ll +++ b/llvm/test/CodeGen/AArch64/logic-shift.ll @@ -34,9 +34,9 @@ define i32 @or_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) { define <8 x i16> @or_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) { ; CHECK-LABEL: or_lshr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.8h, v2.8h ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: neg v1.8h, v2.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <8 x i16> %x0, %y @@ -49,9 +49,9 @@ define <8 x i16> @or_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, < define <2 x i64> @or_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: or_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -94,9 +94,9 @@ define i64 @or_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @or_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) 
{
 ; CHECK-LABEL: or_ashr_commute2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.4s, v2.4s
 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: neg v1.4s, v2.4s
+; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = ashr <4 x i32> %x0, %y
@@ -109,9 +109,9 @@ define <16 x i8> @or_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) {
 ; CHECK-LABEL: or_ashr_commute3:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.16b, v2.16b
 ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: neg v1.16b, v2.16b
+; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = ashr <16 x i8> %x0, %y
@@ -262,9 +262,9 @@ define i32 @xor_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) {
 define <8 x i16> @xor_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) {
 ; CHECK-LABEL: xor_lshr_commute2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.8h, v2.8h
 ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: neg v1.8h, v2.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = lshr <8 x i16> %x0, %y
@@ -277,9 +277,9 @@ define <8 x i16> @xor_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y,
 define <2 x i64> @xor_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) {
 ; CHECK-LABEL: xor_lshr_commute3:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.2d, v2.2d
 ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: neg v1.2d, v2.2d
+; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = lshr <2 x i64> %x0, %y
@@ -322,9 +322,9 @@ define i64 @xor_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) {
 define <4 x i32> @xor_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: xor_ashr_commute2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.4s, v2.4s
 ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: neg v1.4s, v2.4s
+; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = ashr <4 x i32> %x0, %y
@@ -337,9 +337,9 @@ define <4 x i32> @xor_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y,
 define <16 x i8> @xor_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) {
 ; CHECK-LABEL: xor_ashr_commute3:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.16b, v2.16b
 ; CHECK-NEXT: eor v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: neg v1.16b, v2.16b
+; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT: eor v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = ashr <16 x i8> %x0, %y
@@ -490,9 +490,9 @@ define i32 @and_lshr_commute1(i32 %x0, i32 %x1, i32 %y, i32 %z) {
 define <8 x i16> @and_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y, <8 x i16> %z) {
 ; CHECK-LABEL: and_lshr_commute2:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: neg v2.8h, v2.8h
 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: neg v1.8h, v2.8h
+; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b
 ; CHECK-NEXT: ret
   %sh1 = lshr <8 x i16> %x0, %y
@@ -505,9 +505,9 @@ define <8 x i16> @and_lshr_commute2(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %y,
 define <2 x 
i64> @and_lshr_commute3(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %y, <2 x i64> %z) { ; CHECK-LABEL: and_lshr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.2d, v2.2d ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: neg v1.2d, v2.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = lshr <2 x i64> %x0, %y @@ -550,9 +550,9 @@ define i64 @and_ashr_commute1(i64 %x0, i64 %x1, i64 %y, i64 %z) { define <4 x i32> @and_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, <4 x i32> %z) { ; CHECK-LABEL: and_ashr_commute2: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.4s, v2.4s ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: neg v1.4s, v2.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <4 x i32> %x0, %y @@ -565,9 +565,9 @@ define <4 x i32> @and_ashr_commute2(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %y, define <16 x i8> @and_ashr_commute3(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %y, <16 x i8> %z) { ; CHECK-LABEL: and_ashr_commute3: ; CHECK: // %bb.0: -; CHECK-NEXT: neg v2.16b, v2.16b ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: neg v1.16b, v2.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b ; CHECK-NEXT: and v0.16b, v0.16b, v3.16b ; CHECK-NEXT: ret %sh1 = ashr <16 x i8> %x0, %y diff --git a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll index 06570b4539cc1..fac96e07de541 100644 --- a/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll +++ b/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll @@ -258,10 +258,10 @@ define @splice_nxv2i1_idx( %a, @llvm.experimental.vector.splice.nxv2i1( %a, %b, i32 1) ret %res @@ -273,10 +273,10 @@ define @splice_nxv4i1_idx( %a, @llvm.experimental.vector.splice.nxv4i1( %a, %b, i32 2) ret %res @@ -288,10 +288,10 @@ define @splice_nxv8i1_idx( %a, @llvm.experimental.vector.splice.nxv8i1( %a, %b, i32 4) ret %res @@ -303,10 +303,10 @@ define @splice_nxv16i1_idx( %a, @llvm.experimental.vector.splice.nxv16i1( %a, %b, i32 8) ret %res @@ -350,16 +350,16 @@ define @splice_nxv16f32_16( %a, @splice_nxv16i8_neg17( %a, @splice_nxv8i16_neg9( %a, @splice_nxv8f16_neg9( %a, @splice_nxv2i1( %a, ; CHECK-NEXT: ptrue p2.d, vl1 ; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.d, p2.d +; CHECK-NEXT: splice z1.d, p0, z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: rev p2.d, p2.d -; CHECK-NEXT: splice z1.d, p2, z1.d, z0.d ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ret @@ -716,9 +716,9 @@ define @splice_nxv4i1( %a, ; CHECK-NEXT: ptrue p2.s, vl1 ; CHECK-NEXT: mov z0.s, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.s, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.s, p2.s +; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: rev p2.s, p2.s -; CHECK-NEXT: splice z1.s, p2, z1.s, z0.s ; CHECK-NEXT: and z1.s, z1.s, #0x1 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: ret @@ -733,9 +733,9 @@ define @splice_nxv8i1( %a, ; CHECK-NEXT: ptrue p2.h, vl1 ; CHECK-NEXT: mov z0.h, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.h, p0/z, #1 // =0x1 +; CHECK-NEXT: rev p0.h, p2.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: rev p2.h, p2.h -; CHECK-NEXT: splice z1.h, p2, z1.h, z0.h ; CHECK-NEXT: and z1.h, z1.h, #0x1 ; CHECK-NEXT: cmpne 
p0.h, p0/z, z1.h, #0 ; CHECK-NEXT: ret @@ -750,9 +750,9 @@ define @splice_nxv16i1( %a, @splice_nxv8i32( %a, @splice_nxv16f32_neg17( %a, %a, <24 x i8> %b, <24 x i8> %c, <24 ; CHECK-NEXT: ld1 { v5.b }[15], [x8] ; CHECK-NEXT: ld1 { v7.b }[7], [x10] ; CHECK-NEXT: addp v1.2s, v19.2s, v19.2s +; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: sdot v16.4s, v5.16b, v4.16b ; CHECK-NEXT: sdot v18.2s, v7.8b, v6.8b -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: addv s2, v16.4s ; CHECK-NEXT: addp v3.2s, v18.2s, v18.2s @@ -975,8 +975,8 @@ define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> % ; CHECK-NEXT: addv s3, v5.4s ; CHECK-NEXT: addp v1.2s, v17.2s, v17.2s ; CHECK-NEXT: addp v2.2s, v7.2s, v7.2s -; CHECK-NEXT: addv s0, v6.4s ; CHECK-NEXT: fmov w10, s3 +; CHECK-NEXT: addv s0, v6.4s ; CHECK-NEXT: fmov w9, s1 ; CHECK-NEXT: fmov w11, s2 ; CHECK-NEXT: fmov w8, s0 @@ -998,26 +998,26 @@ entry: define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q4, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ldp q5, q1, [x1] -; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: ushll v6.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: ushll v6.8h, v2.8b, #0 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: ushll v7.8h, v5.8b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: umull v3.4s, v4.4h, v3.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: umull2 v16.4s, v7.8h, v6.8h ; CHECK-NEXT: umull v6.4s, v7.4h, v6.4h -; CHECK-NEXT: mov v3.s[0], v2.s[0] -; CHECK-NEXT: ushll2 v2.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v4.8h, v5.16b, #0 -; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: mov v4.s[0], v3.s[0] +; CHECK-NEXT: ushll2 v3.8h, v5.16b, #0 ; CHECK-NEXT: umlal2 v16.4s, v1.8h, v0.8h -; CHECK-NEXT: umlal v3.4s, v4.4h, v2.4h -; CHECK-NEXT: umlal2 v16.4s, v4.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v3.4s +; CHECK-NEXT: umlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: umlal v4.4s, v3.4h, v2.4h +; CHECK-NEXT: umlal2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1039,17 +1039,17 @@ define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ldp q2, q1, [x0] ; CHECK-NEXT: movi v0.2d, #0000000000000000 -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 ; CHECK-NEXT: ushll v4.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v3.8h -; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddl v3.4s, v4.4h, v3.4h -; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: uaddw2 v1.4s, v5.4s, v2.8h +; CHECK-NEXT: ushll v3.4s, v3.4h, #0 +; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v1.8h +; CHECK-NEXT: uaddl v1.4s, v4.4h, v1.4h +; CHECK-NEXT: mov v0.s[0], v3.s[0] +; CHECK-NEXT: uaddw2 v3.4s, v5.4s, v2.8h +; CHECK-NEXT: add v1.4s, v1.4s, v3.4s ; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1063,26 +1063,26 @@ entry: define i32 
@test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q4, q0, [x0] +; CHECK-NEXT: ldp q2, q0, [x0] ; CHECK-NEXT: ldp q5, q1, [x1] -; CHECK-NEXT: sshll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: sshll v6.8h, v4.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v0.16b, #0 +; CHECK-NEXT: sshll v6.8h, v2.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 ; CHECK-NEXT: sshll v7.8h, v5.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: smull v2.4s, v3.4h, v2.4h -; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: smull v3.4s, v4.4h, v3.4h +; CHECK-NEXT: movi v4.2d, #0000000000000000 ; CHECK-NEXT: smull2 v16.4s, v7.8h, v6.8h ; CHECK-NEXT: smull v6.4s, v7.4h, v6.4h -; CHECK-NEXT: mov v3.s[0], v2.s[0] -; CHECK-NEXT: sshll2 v2.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v4.8h, v5.16b, #0 -; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: mov v4.s[0], v3.s[0] +; CHECK-NEXT: sshll2 v3.8h, v5.16b, #0 ; CHECK-NEXT: smlal2 v16.4s, v1.8h, v0.8h -; CHECK-NEXT: smlal v3.4s, v4.4h, v2.4h -; CHECK-NEXT: smlal2 v16.4s, v4.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v3.4s +; CHECK-NEXT: smlal v6.4s, v1.4h, v0.4h +; CHECK-NEXT: smlal v4.4s, v3.4h, v2.4h +; CHECK-NEXT: smlal2 v16.4s, v3.8h, v2.8h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1105,222 +1105,222 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: fmov s4, w0 -; CHECK-NEXT: ldr b0, [sp, #80] -; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b1, [sp, #16] -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ldr b2, [sp, #280] -; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: ldr b0, [sp, #80] +; CHECK-NEXT: add x11, sp, #24 ; CHECK-NEXT: ldr b3, [sp, #216] +; CHECK-NEXT: add x10, sp, #88 +; CHECK-NEXT: ldr b2, [sp, #280] +; CHECK-NEXT: ld1 { v1.b }[1], [x11] ; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: mov v4.b[1], w1 -; CHECK-NEXT: ld1 { v1.b }[1], [x10] +; CHECK-NEXT: ldr b4, [sp, #152] +; CHECK-NEXT: ldr b6, [sp, #480] +; CHECK-NEXT: ld1 { v0.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #288 -; CHECK-NEXT: ldr b5, [sp, #152] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x12, sp, #160 ; CHECK-NEXT: ld1 { v3.b }[1], [x11] -; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x9] -; CHECK-NEXT: ld1 { v5.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #32 -; CHECK-NEXT: add x11, sp, #296 -; CHECK-NEXT: mov v4.b[2], w2 -; CHECK-NEXT: ld1 { v1.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #232 +; CHECK-NEXT: add x11, sp, #488 +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: ld1 { v4.b }[1], [x12] +; CHECK-NEXT: ld1 { v6.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #32 +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x11] -; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v1.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #232 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #296 +; CHECK-NEXT: ld1 { v3.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: ld1 { v4.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #40 +; CHECK-NEXT: ld1 { v1.b }[3], 
[x11] ; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] -; CHECK-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #240 -; CHECK-NEXT: mov v4.b[3], w3 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: add x12, sp, #112 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-NEXT: ld1 { v0.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #184 -; CHECK-NEXT: ld1 { v1.b }[4], [x13] -; CHECK-NEXT: add x15, sp, #56 -; CHECK-NEXT: add x14, sp, #128 -; CHECK-NEXT: mov v4.b[4], w4 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: add x13, sp, #256 -; CHECK-NEXT: ld1 { v5.b }[4], [x12] -; CHECK-NEXT: ld1 { v0.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: add x12, sp, #248 -; CHECK-NEXT: ld1 { v1.b }[5], [x15] -; CHECK-NEXT: add x15, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[4], [x12] -; CHECK-NEXT: ld1 { v2.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #304 +; CHECK-NEXT: add x10, sp, #112 +; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: add x13, sp, #56 +; CHECK-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v3.b }[3], [x11] +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x15, sp, #312 +; CHECK-NEXT: add x12, sp, #120 +; CHECK-NEXT: add x8, sp, #248 ; CHECK-NEXT: add x11, sp, #64 -; CHECK-NEXT: mov v4.b[5], w5 -; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: ld1 { v0.b }[6], [x14] -; CHECK-NEXT: ldr b6, [sp, #352] -; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: ld1 { v1.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ld1 { v3.b }[5], [x13] +; CHECK-NEXT: ld1 { v2.b }[4], [x15] +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x15, sp, #320 +; CHECK-NEXT: ld1 { v1.b }[5], [x13] +; CHECK-NEXT: ld1 { v0.b }[5], [x12] ; CHECK-NEXT: ldr b18, [sp, #552] -; CHECK-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-NEXT: add x14, sp, #208 -; CHECK-NEXT: ld1 { v6.b }[1], [x11] -; CHECK-NEXT: mov v4.b[6], w6 -; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #560 -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: add x14, sp, #128 +; CHECK-NEXT: add x16, sp, #256 +; CHECK-NEXT: ldr b16, [sp, #352] +; CHECK-NEXT: ld1 { v2.b }[5], [x15] +; CHECK-NEXT: add x15, sp, #176 +; CHECK-NEXT: ld1 { v3.b }[5], [x16] +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #560 +; CHECK-NEXT: ld1 { v0.b }[6], [x14] +; CHECK-NEXT: add x16, sp, #360 +; CHECK-NEXT: ld1 { v4.b }[3], [x15] +; CHECK-NEXT: ld1 { v18.b }[1], [x11] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ld1 { v16.b }[1], [x16] +; CHECK-NEXT: add x9, sp, #136 +; CHECK-NEXT: add x14, sp, #184 +; CHECK-NEXT: ld1 { v1.b }[7], [x10] ; CHECK-NEXT: add x10, sp, #568 -; CHECK-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: ld1 { v4.b }[4], [x14] ; CHECK-NEXT: add x9, sp, #368 -; CHECK-NEXT: ld1 { v6.b }[2], [x9] -; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ldr b7, [sp, #144] -; CHECK-NEXT: mov v4.b[7], w7 ; CHECK-NEXT: ld1 { v18.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #376 -; CHECK-NEXT: sshll v17.8h, v5.8b, #0 -; CHECK-NEXT: ldr b5, [sp, #480] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: add x11, sp, #496 +; CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: fmov s5, w0 +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: ld1 { v6.b }[2], [x11] ; CHECK-NEXT: add 
x10, sp, #576 -; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v5.b }[1], [x11] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #376 ; CHECK-NEXT: ld1 { v18.b }[3], [x10] -; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: sshll v16.8h, v4.8b, #0 -; CHECK-NEXT: ldr b4, [sp, #344] -; CHECK-NEXT: add x10, sp, #384 -; CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x11, sp, #504 +; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: mov v5.b[1], w1 +; CHECK-NEXT: ldr b7, [sp, #144] +; CHECK-NEXT: ldr b17, [sp, #344] +; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: ld1 { v6.b }[3], [x11] ; CHECK-NEXT: add x10, sp, #584 -; CHECK-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-NEXT: sshll v19.8h, v4.8b, #0 -; CHECK-NEXT: ld1 { v5.b }[2], [x11] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #384 ; CHECK-NEXT: ld1 { v18.b }[4], [x10] -; CHECK-NEXT: smull2 v4.4s, v16.8h, v17.8h -; CHECK-NEXT: smull v16.4s, v16.4h, v17.4h -; CHECK-NEXT: ldr b17, [sp, #416] -; CHECK-NEXT: add x11, sp, #504 -; CHECK-NEXT: add x10, sp, #424 -; CHECK-NEXT: add x16, sp, #320 -; CHECK-NEXT: smull v19.4s, v7.4h, v19.4h -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #392 -; CHECK-NEXT: ld1 { v17.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #592 -; CHECK-NEXT: ld1 { v2.b }[5], [x16] -; CHECK-NEXT: ld1 { v6.b }[5], [x11] -; CHECK-NEXT: ld1 { v18.b }[5], [x10] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 ; CHECK-NEXT: add x11, sp, #512 -; CHECK-NEXT: add x10, sp, #432 +; CHECK-NEXT: ld1 { v16.b }[4], [x9] +; CHECK-NEXT: ld1 { v6.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #592 +; CHECK-NEXT: mov v5.b[2], w2 +; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: ldr b19, [sp, #680] +; CHECK-NEXT: ld1 { v18.b }[5], [x11] +; CHECK-NEXT: smull v7.4s, v7.4h, v17.4h +; CHECK-NEXT: ldr b17, [sp, #416] +; CHECK-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #688 ; CHECK-NEXT: add x12, sp, #328 -; CHECK-NEXT: mov v7.s[0], v19.s[0] -; CHECK-NEXT: ld1 { v5.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v17.b }[2], [x10] +; CHECK-NEXT: add x9, sp, #424 +; CHECK-NEXT: ld1 { v19.b }[1], [x10] ; CHECK-NEXT: add x10, sp, #600 -; CHECK-NEXT: ldr b19, [sp, #680] ; CHECK-NEXT: ldr b20, [sp, #616] ; CHECK-NEXT: ld1 { v2.b }[6], [x12] -; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-NEXT: add x11, sp, #400 ; CHECK-NEXT: ld1 { v18.b }[6], [x10] -; CHECK-NEXT: add x11, sp, #688 ; CHECK-NEXT: add x12, sp, #624 -; CHECK-NEXT: ld1 { v19.b }[1], [x11] +; CHECK-NEXT: mov v5.b[3], w3 +; CHECK-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: ld1 { v20.b }[1], [x12] -; CHECK-NEXT: add x10, sp, #408 +; CHECK-NEXT: add x9, sp, #432 +; CHECK-NEXT: ld1 { v19.b }[2], [x11] ; CHECK-NEXT: add x11, sp, #608 -; CHECK-NEXT: add x12, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: ld1 { v17.b }[2], [x9] +; CHECK-NEXT: add x10, sp, #408 ; CHECK-NEXT: ld1 { v18.b }[7], [x11] -; CHECK-NEXT: ld1 { v17.b }[3], [x12] -; CHECK-NEXT: add x10, sp, #696 ; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v16.b }[7], [x10] ; CHECK-NEXT: ld1 { v20.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #640 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: mov v5.b[4], w4 ; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x12, 
sp, #440 ; CHECK-NEXT: ld1 { v19.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: add x12, sp, #520 -; CHECK-NEXT: ld1 { v20.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: ldr b21, [sp, #544] -; CHECK-NEXT: smull2 v22.4s, v6.8h, v18.8h -; CHECK-NEXT: smull v6.4s, v6.4h, v18.4h -; CHECK-NEXT: ldr b18, [sp, #744] -; CHECK-NEXT: ld1 { v19.b }[4], [x10] -; CHECK-NEXT: ld1 { v5.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #656 -; CHECK-NEXT: ld1 { v20.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #456 -; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v17.b }[3], [x12] +; CHECK-NEXT: add x12, sp, #640 +; CHECK-NEXT: sshll v21.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[3], [x12] ; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x11, sp, #712 +; CHECK-NEXT: mov v5.b[5], w5 +; CHECK-NEXT: ld1 { v19.b }[4], [x11] +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v17.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #648 +; CHECK-NEXT: ldr b22, [sp, #544] +; CHECK-NEXT: ld1 { v20.b }[4], [x10] +; CHECK-NEXT: smull2 v16.4s, v21.8h, v18.8h +; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: ldr b21, [sp, #744] +; CHECK-NEXT: add x11, sp, #720 +; CHECK-NEXT: ld1 { v6.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #456 ; CHECK-NEXT: ld1 { v19.b }[5], [x11] +; CHECK-NEXT: mov v5.b[6], w6 +; CHECK-NEXT: ld1 { v17.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: sshll v22.8h, v22.8b, #0 +; CHECK-NEXT: sshll v21.8h, v21.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[5], [x9] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: add x11, sp, #464 -; CHECK-NEXT: ld1 { v20.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x10] -; CHECK-NEXT: add x12, sp, #728 -; CHECK-NEXT: add x13, sp, #664 -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: ld1 { v17.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[6], [x12] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-NEXT: add x11, sp, #728 +; CHECK-NEXT: ld1 { v6.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #464 +; CHECK-NEXT: ld1 { v19.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #664 +; CHECK-NEXT: ld1 { v17.b }[6], [x10] +; CHECK-NEXT: smull v21.4s, v22.4h, v21.4h +; CHECK-NEXT: movi v22.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v20.b }[6], [x11] +; CHECK-NEXT: mov v5.b[7], w7 +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: add x10, sp, #736 +; CHECK-NEXT: add x11, sp, #208 +; CHECK-NEXT: add x13, sp, #264 +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ld1 { v19.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[7], [x11] +; CHECK-NEXT: add x9, sp, #472 +; CHECK-NEXT: add x10, sp, #672 +; CHECK-NEXT: ld1 { v3.b }[6], [x13] +; CHECK-NEXT: ld1 { v17.b }[7], [x9] +; CHECK-NEXT: ld1 { v20.b }[7], [x10] ; CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v20.b }[6], [x13] -; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: smull v18.4s, v21.4h, v18.4h +; CHECK-NEXT: mov v22.s[0], v21.s[0] ; CHECK-NEXT: movi v21.2d, #0000000000000000 -; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 ; CHECK-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-NEXT: ld1 { v3.b }[7], [x9] -; CHECK-NEXT: ld1 { v5.b }[7], [x10] -; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: add x9, sp, #736 -; CHECK-NEXT: add x10, sp, #672 -; CHECK-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: ld1 { v20.b }[7], [x10] +; CHECK-NEXT: add x8, sp, #272 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; 
CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: mov v21.s[0], v18.s[0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: smlal v18.4s, v6.4h, v19.4h +; CHECK-NEXT: smlal2 v16.4s, v6.8h, v19.8h +; CHECK-NEXT: mov v21.s[0], v7.s[0] +; CHECK-NEXT: smull v6.4s, v5.4h, v4.4h ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: sshll v19.8h, v20.8b, #0 -; CHECK-NEXT: smlal v16.4s, v0.4h, v2.4h +; CHECK-NEXT: smlal v22.4s, v17.4h, v20.4h +; CHECK-NEXT: smull2 v4.4s, v5.8h, v4.8h +; CHECK-NEXT: smlal v21.4s, v1.4h, v3.4h +; CHECK-NEXT: smlal2 v16.4s, v17.8h, v20.8h +; CHECK-NEXT: smlal v6.4s, v0.4h, v2.4h +; CHECK-NEXT: add v5.4s, v18.4s, v22.4s ; CHECK-NEXT: smlal2 v4.4s, v0.8h, v2.8h -; CHECK-NEXT: smlal v7.4s, v1.4h, v3.4h -; CHECK-NEXT: smlal v6.4s, v5.4h, v18.4h -; CHECK-NEXT: smlal2 v22.4s, v5.8h, v18.8h -; CHECK-NEXT: smlal v21.4s, v17.4h, v19.4h +; CHECK-NEXT: add v0.4s, v6.4s, v21.4s +; CHECK-NEXT: add v2.4s, v5.4s, v16.4s ; CHECK-NEXT: smlal2 v4.4s, v1.8h, v3.8h -; CHECK-NEXT: add v0.4s, v16.4s, v7.4s -; CHECK-NEXT: add v1.4s, v6.4s, v21.4s -; CHECK-NEXT: smlal2 v22.4s, v17.8h, v19.8h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v4.4s -; CHECK-NEXT: add v1.4s, v1.4s, v22.4s -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -1584,34 +1584,34 @@ entry: define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: ldp q4, q5, [x1] ; CHECK-NEXT: ldr b0, [x0, #32] ; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldp q2, q4, [x0] +; CHECK-NEXT: ldp q3, q6, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v6.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: ushll v7.8h, v4.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v4.16b, #0 -; CHECK-NEXT: ushll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v19.8h, v5.16b, #0 -; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: ushll v5.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: ushll2 v16.8h, v4.16b, #0 +; CHECK-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 ; CHECK-NEXT: umull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: umull2 v1.4s, v7.8h, v6.8h -; CHECK-NEXT: umull2 v17.4s, v4.8h, v2.8h -; CHECK-NEXT: umull v2.4s, v4.4h, v2.4h -; CHECK-NEXT: umlal2 v17.4s, v19.8h, v16.8h -; CHECK-NEXT: umlal2 v1.4s, v5.8h, v3.8h -; CHECK-NEXT: mov v18.s[0], v0.s[0] -; CHECK-NEXT: umlal v2.4s, v19.4h, v16.4h -; CHECK-NEXT: add v0.4s, v1.4s, v17.4s -; CHECK-NEXT: umlal v18.4s, v7.4h, v6.4h -; CHECK-NEXT: umlal v18.4s, v5.4h, v3.4h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v19.8h, v6.16b, #0 +; CHECK-NEXT: ushll v6.8h, v6.8b, #0 +; CHECK-NEXT: umull2 v17.4s, v7.8h, v5.8h +; CHECK-NEXT: umull2 v18.4s, v3.8h, v2.8h +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: umull v0.4s, v3.4h, v2.4h +; CHECK-NEXT: umlal2 v18.4s, v19.8h, 
v16.8h +; CHECK-NEXT: umlal2 v17.4s, v6.8h, v4.8h +; CHECK-NEXT: umlal v1.4s, v7.4h, v5.4h +; CHECK-NEXT: umlal v0.4s, v19.4h, v16.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: umlal v1.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1639,14 +1639,14 @@ define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-NEXT: uaddl2 v6.4s, v3.8h, v2.8h -; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: uaddl2 v6.4s, v5.8h, v4.8h ; CHECK-NEXT: mov v0.s[0], v1.s[0] -; CHECK-NEXT: uaddl2 v1.4s, v5.8h, v4.8h -; CHECK-NEXT: add v1.4s, v1.4s, v6.4s +; CHECK-NEXT: uaddl2 v1.4s, v3.8h, v2.8h +; CHECK-NEXT: uaddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: uaddw v0.4s, v0.4s, v5.4h -; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1660,34 +1660,34 @@ entry: define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: ldp q4, q5, [x1] ; CHECK-NEXT: ldr b0, [x0, #32] ; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldp q2, q4, [x0] +; CHECK-NEXT: ldp q3, q6, [x1] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v6.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v7.8h, v4.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v4.16b, #0 -; CHECK-NEXT: sshll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: sshll2 v19.8h, v5.16b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: sshll2 v16.8h, v4.16b, #0 +; CHECK-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 ; CHECK-NEXT: smull v0.4s, v1.4h, v0.4h -; CHECK-NEXT: smull2 v1.4s, v7.8h, v6.8h -; CHECK-NEXT: smull2 v17.4s, v4.8h, v2.8h -; CHECK-NEXT: smull v2.4s, v4.4h, v2.4h -; CHECK-NEXT: smlal2 v17.4s, v19.8h, v16.8h -; CHECK-NEXT: smlal2 v1.4s, v5.8h, v3.8h -; CHECK-NEXT: mov v18.s[0], v0.s[0] -; CHECK-NEXT: smlal v2.4s, v19.4h, v16.4h -; CHECK-NEXT: add v0.4s, v1.4s, v17.4s -; CHECK-NEXT: smlal v18.4s, v7.4h, v6.4h -; CHECK-NEXT: smlal v18.4s, v5.4h, v3.4h -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v0.4s, v18.4s, v0.4s +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: sshll2 v19.8h, v6.16b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: smull2 v17.4s, v7.8h, v5.8h +; CHECK-NEXT: smull2 v18.4s, v3.8h, v2.8h +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: smull v0.4s, v3.4h, v2.4h +; CHECK-NEXT: smlal2 v18.4s, v19.8h, v16.8h +; CHECK-NEXT: smlal2 v17.4s, v6.8h, v4.8h +; CHECK-NEXT: smlal v1.4s, v7.4h, v5.4h +; CHECK-NEXT: smlal v0.4s, v19.4h, v16.4h +; CHECK-NEXT: add v2.4s, v17.4s, v18.4s +; CHECK-NEXT: smlal v1.4s, v6.4h, v4.4h +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -2018,151 +2018,151 @@ define i32 @test_sdot_v33i8_double_nomla(<33 
x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ldr b1, [sp, #80] +; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #144] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x12, sp, #32 -; CHECK-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x11, sp, #112 -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ldr b4, [sp, #480] -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-NEXT: ld1 { v2.b }[1], [x10] +; CHECK-NEXT: add x8, sp, #104 +; CHECK-NEXT: ldr b3, [sp, #16] +; CHECK-NEXT: ldr b5, [sp, #480] +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: add x13, sp, #488 +; CHECK-NEXT: ldr b4, [sp, #608] +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: ld1 { v2.b }[2], [x11] +; CHECK-NEXT: add x12, sp, #112 +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: ld1 { v5.b }[1], [x13] +; CHECK-NEXT: add x10, sp, #616 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: ld1 { v4.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #32 +; CHECK-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: add x13, sp, #48 -; CHECK-NEXT: ld1 { v3.b }[2], [x12] -; CHECK-NEXT: add x12, sp, #40 -; CHECK-NEXT: ldr b5, [sp, #608] -; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #120 ; CHECK-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-NEXT: mov v0.b[1], w1 -; CHECK-NEXT: add x9, sp, #128 -; CHECK-NEXT: add x14, sp, #184 -; CHECK-NEXT: ldr b16, [sp, #544] -; CHECK-NEXT: ld1 { v3.b }[3], [x12] -; CHECK-NEXT: add x12, sp, #176 -; CHECK-NEXT: ldr b17, [sp, #672] -; CHECK-NEXT: ld1 { v1.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #488 -; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: mov v0.b[2], w2 -; CHECK-NEXT: add x11, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x13] -; CHECK-NEXT: add x13, sp, #616 -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: ld1 { v1.b }[5], [x10] -; CHECK-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-NEXT: add x13, sp, #496 +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: ld1 { v3.b }[2], [x10] +; CHECK-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #176 +; CHECK-NEXT: ldr b6, [sp, #544] +; CHECK-NEXT: ld1 { v0.b }[4], [x12] +; CHECK-NEXT: add x14, sp, #552 +; CHECK-NEXT: ldr b7, [sp, #672] +; CHECK-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-NEXT: add x13, sp, #40 +; CHECK-NEXT: ld1 { v6.b }[1], [x14] +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: ld1 { v3.b }[3], [x13] +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #680 +; CHECK-NEXT: add x13, sp, #184 +; CHECK-NEXT: ld1 { v7.b }[1], [x9] +; CHECK-NEXT: ld1 { v2.b }[5], [x13] +; CHECK-NEXT: add x13, sp, #624 +; CHECK-NEXT: add x15, sp, #504 ; CHECK-NEXT: ld1 { v4.b }[2], [x13] -; CHECK-NEXT: ld1 { v2.b }[5], [x14] -; CHECK-NEXT: add x14, sp, #680 -; CHECK-NEXT: ld1 { v17.b }[1], [x14] -; CHECK-NEXT: add x13, sp, #504 -; CHECK-NEXT: ld1 { v3.b }[5], [x12] -; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: add x12, sp, #688 -; CHECK-NEXT: ld1 { 
v16.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v4.b }[3], [x13] -; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x10, sp, #136 +; CHECK-NEXT: ld1 { v0.b }[6], [x11] ; CHECK-NEXT: add x11, sp, #560 -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: ld1 { v17.b }[2], [x12] -; CHECK-NEXT: ld1 { v5.b }[2], [x9] -; CHECK-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-NEXT: ld1 { v16.b }[2], [x11] -; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: mov v0.b[3], w3 -; CHECK-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: add x11, sp, #632 -; CHECK-NEXT: ld1 { v17.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #520 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] -; CHECK-NEXT: ld1 { v5.b }[3], [x11] -; CHECK-NEXT: add x8, sp, #640 -; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[3], [x15] +; CHECK-NEXT: ld1 { v6.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #688 +; CHECK-NEXT: mov v1.b[3], w3 +; CHECK-NEXT: ld1 { v7.b }[2], [x11] +; CHECK-NEXT: add x9, sp, #632 +; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #568 +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v6.b }[3], [x9] +; CHECK-NEXT: ld1 { v5.b }[4], [x11] +; CHECK-NEXT: ld1 { v7.b }[3], [x10] +; CHECK-NEXT: add x9, sp, #640 +; CHECK-NEXT: mov v1.b[4], w4 +; CHECK-NEXT: ld1 { v4.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: add x11, sp, #704 +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: add x11, sp, #520 +; CHECK-NEXT: ld1 { v6.b }[4], [x9] ; CHECK-NEXT: ldr b18, [sp, #736] -; CHECK-NEXT: mov v0.b[4], w4 -; CHECK-NEXT: ld1 { v17.b }[4], [x11] -; CHECK-NEXT: ld1 { v16.b }[4], [x9] -; CHECK-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-NEXT: add x9, sp, #528 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: add x8, sp, #648 +; CHECK-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-NEXT: ld1 { v5.b }[5], [x11] +; CHECK-NEXT: add x12, sp, #192 +; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: ld1 { v2.b }[6], [x12] +; CHECK-NEXT: add x9, sp, #648 +; CHECK-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-NEXT: add x10, sp, #528 ; CHECK-NEXT: add x11, sp, #584 ; CHECK-NEXT: add x12, sp, #712 -; CHECK-NEXT: ld1 { v4.b }[6], [x9] -; CHECK-NEXT: movi v7.2d, #0000000000000000 -; CHECK-NEXT: ld1 { v16.b }[5], [x11] -; CHECK-NEXT: ld1 { v17.b }[5], [x12] -; CHECK-NEXT: ld1 { v5.b }[5], [x8] -; CHECK-NEXT: mov v0.b[5], w5 -; CHECK-NEXT: add x9, sp, #536 -; CHECK-NEXT: sshll v18.4s, v18.4h, #0 -; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: ld1 { v7.b }[5], [x12] +; CHECK-NEXT: ld1 { v4.b }[5], [x9] +; CHECK-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-NEXT: add x14, sp, #56 +; CHECK-NEXT: movi v17.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v3.b }[5], [x14] +; CHECK-NEXT: add x9, sp, #656 +; CHECK-NEXT: add x10, sp, #536 ; CHECK-NEXT: add x11, sp, #592 ; CHECK-NEXT: add x12, sp, #720 -; CHECK-NEXT: ld1 { v4.b }[7], [x9] -; CHECK-NEXT: ld1 { v16.b }[6], [x11] -; CHECK-NEXT: ld1 { v17.b }[6], [x12] -; CHECK-NEXT: ld1 { v5.b }[6], [x8] -; CHECK-NEXT: ldr b6, [sp, #208] -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: mov v7.s[0], v18.s[0] -; CHECK-NEXT: mov v0.b[6], w6 -; CHECK-NEXT: ld1 { v3.b }[6], [x10] +; CHECK-NEXT: sshll v18.4s, v18.4h, #0 +; CHECK-NEXT: ldr b16, [sp, #208] +; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v7.b }[6], [x12] +; CHECK-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-NEXT: ld1 { v5.b 
}[7], [x10] +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: mov v1.b[6], w6 +; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #664 ; CHECK-NEXT: add x9, sp, #600 ; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-NEXT: mov v17.s[0], v18.s[0] +; CHECK-NEXT: ld1 { v6.b }[7], [x9] +; CHECK-NEXT: ld1 { v7.b }[7], [x10] +; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: movi v18.2d, #0000000000000000 -; CHECK-NEXT: mov v0.b[7], w7 +; CHECK-NEXT: sshll v16.4s, v16.4h, #0 +; CHECK-NEXT: mov v1.b[7], w7 ; CHECK-NEXT: add x9, sp, #200 -; CHECK-NEXT: add x10, sp, #72 -; CHECK-NEXT: saddw v7.4s, v7.4s, v4.4h -; CHECK-NEXT: sshll v6.4s, v6.4h, #0 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: add x8, sp, #72 ; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: ld1 { v3.b }[7], [x10] -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: mov v18.s[0], v6.s[0] +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: saddw v17.4s, v17.4s, v5.4h +; CHECK-NEXT: mov v18.s[0], v16.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: saddl2 v6.4s, v17.8h, v16.8h -; CHECK-NEXT: saddl2 v4.4s, v5.8h, v4.8h -; CHECK-NEXT: saddl v16.4s, v17.4h, v16.4h -; CHECK-NEXT: saddw v5.4s, v7.4s, v5.4h +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: saddl2 v17.4s, v0.8h, v1.8h -; CHECK-NEXT: saddw v0.4s, v18.4s, v0.4h +; CHECK-NEXT: saddl2 v16.4s, v7.8h, v6.8h +; CHECK-NEXT: saddl2 v5.4s, v4.8h, v5.8h +; CHECK-NEXT: saddl v6.4s, v7.4h, v6.4h +; CHECK-NEXT: saddw v4.4s, v17.4s, v4.4h +; CHECK-NEXT: saddl2 v17.4s, v1.8h, v0.8h +; CHECK-NEXT: saddw v1.4s, v18.4s, v1.4h ; CHECK-NEXT: saddl2 v7.4s, v3.8h, v2.8h -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h ; CHECK-NEXT: add v5.4s, v5.4s, v16.4s -; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: add v4.4s, v4.4s, v6.4s +; CHECK-NEXT: saddw v0.4s, v1.4s, v0.4h ; CHECK-NEXT: add v6.4s, v17.4s, v7.4s -; CHECK-NEXT: add v1.4s, v5.4s, v4.4s +; CHECK-NEXT: add v1.4s, v4.4s, v5.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v1.4s, v6.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AArch64/neon-extadd.ll b/llvm/test/CodeGen/AArch64/neon-extadd.ll index 913205f327536..16200435c5c31 100644 --- a/llvm/test/CodeGen/AArch64/neon-extadd.ll +++ b/llvm/test/CodeGen/AArch64/neon-extadd.ll @@ -515,15 +515,15 @@ define <20 x i32> @v20(<20 x i8> %s0, <20 x i8> %s1) { ; CHECK-NEXT: mov v0.b[6], w6 ; CHECK-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-NEXT: uaddl v2.8h, v3.8b, v2.8b +; CHECK-NEXT: ushll v3.4s, v4.4h, #0 ; CHECK-NEXT: mov v0.b[7], w7 -; CHECK-NEXT: ushll2 v3.4s, v2.8h, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: ushll v1.4s, v4.4h, #0 -; CHECK-NEXT: stp q3, q1, [x8, #48] -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 +; CHECK-NEXT: ushll2 v1.4s, v2.8h, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #0 +; CHECK-NEXT: stp q1, q3, [x8, #48] +; CHECK-NEXT: ushll2 v3.4s, v0.8h, #0 ; CHECK-NEXT: ushll v0.4s, v0.4h, 
#0 -; CHECK-NEXT: stp q1, q2, [x8, #16] +; CHECK-NEXT: stp q3, q2, [x8, #16] ; CHECK-NEXT: str q0, [x8] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll index 881bbf315e8e9..45272143e8592 100644 --- a/llvm/test/CodeGen/AArch64/neon-shift-neg.ll +++ b/llvm/test/CodeGen/AArch64/neon-shift-neg.ll @@ -375,8 +375,8 @@ entry: define @shrn64x2( %a, i64 %b) { ; CHECK-LABEL: shrn64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -391,8 +391,8 @@ entry: define @shrn32x4( %a, i32 %b) { ; CHECK-LABEL: shrn32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -407,8 +407,8 @@ entry: define @shrn16x8( %a, i16 %b) { ; CHECK-LABEL: shrn16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -423,8 +423,8 @@ entry: define @shrn8x16( %a, i8 %b) { ; CHECK-LABEL: shrn8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret @@ -439,8 +439,8 @@ entry: define @lshrn64x2( %a, i64 %b) { ; CHECK-LABEL: lshrn64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -455,8 +455,8 @@ entry: define @lshrn32x4( %a, i32 %b) { ; CHECK-LABEL: lshrn32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -471,8 +471,8 @@ entry: define @lshrn16x8( %a, i16 %b) { ; CHECK-LABEL: lshrn16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -487,8 +487,8 @@ entry: define @lshrn8x16( %a, i8 %b) { ; CHECK-LABEL: lshrn8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret @@ -503,8 +503,8 @@ entry: define @shln64x2( %a, i64 %b) { ; CHECK-LABEL: shln64x2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: neg x8, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret @@ -519,8 +519,8 @@ entry: define @shln32x4( %a, i32 %b) { ; CHECK-LABEL: shln32x4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -535,8 +535,8 @@ entry: define @shln16x8( %a, i16 %b) { ; CHECK-LABEL: shln16x8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -551,8 +551,8 @@ entry: define @shln8x16( %a, i8 %b) { ; CHECK-LABEL: shln8x16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b 
; CHECK-NEXT: neg w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll index 6b3cfc040cb3d..20088354bdb75 100644 --- a/llvm/test/CodeGen/AArch64/predicated-add-sub.ll +++ b/llvm/test/CodeGen/AArch64/predicated-add-sub.ll @@ -83,11 +83,11 @@ define @zext.add.2xi64( %a, @zext.add.8xi32( %a, %v) #0 { ; CHECK-LABEL: zext.add.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #1 // =0x1 +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = zext %v to %result = add %a, %extend @@ -103,8 +103,8 @@ define @zext.add.16xi32( %a, @zext.sub.2xi64( %a, @zext.sub.8xi32( %a, %v) #0 { ; CHECK-LABEL: zext.sub.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: add z0.s, p1/m, z0.s, z2.s ; CHECK-NEXT: add z1.s, p0/m, z1.s, z2.s @@ -214,8 +214,8 @@ define @zext.sub.16xi32( %a, @sext.add.2xi64( %a, @sext.add.8xi32( %a, %v) #0 { ; CHECK-LABEL: sext.add.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: add z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: add z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = sext %v to %result = add %a, %extend @@ -325,8 +325,8 @@ define @sext.add.16xi32( %a, @sext.sub.2xi64( %a, @sext.sub.8xi32( %a, %v) #0 { ; CHECK-LABEL: sext.sub.8xi32: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: mov z2.s, #-1 // =0xffffffffffffffff +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: sub z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z2.s +; CHECK-NEXT: sub z1.s, p1/m, z1.s, z2.s ; CHECK-NEXT: ret %extend = sext %v to %result = sub %a, %extend @@ -436,8 +436,8 @@ define @sext.sub.16xi32( %a, This Inner Loop Header: Depth=1 +; CHECK-NEXT: str q14, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr q14, [x8] ; CHECK-NEXT: mov x12, xzr -; CHECK-NEXT: str q18, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr x14, [x12] -; CHECK-NEXT: ldr q15, [x12] -; CHECK-NEXT: add x7, x11, x8 +; CHECK-NEXT: stp q29, q15, [sp] // 32-byte Folded Spill +; CHECK-NEXT: add x19, x11, x8 ; CHECK-NEXT: fmov x15, d14 ; CHECK-NEXT: mov x16, v14.d[1] -; CHECK-NEXT: ldr q18, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: fmov x18, d15 -; CHECK-NEXT: mov x13, v15.d[1] -; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: ldr q15, [x12] ; CHECK-NEXT: ldr q14, [x10], #64 -; CHECK-NEXT: ldr x7, [x7, #128] +; CHECK-NEXT: mov v8.16b, v28.16b +; CHECK-NEXT: fmov x13, d15 +; CHECK-NEXT: mov x18, v15.d[1] +; CHECK-NEXT: mov v28.16b, v24.16b ; CHECK-NEXT: mul x17, x15, x14 -; CHECK-NEXT: mov v6.16b, v0.16b -; CHECK-NEXT: mov v9.16b, v27.16b ; CHECK-NEXT: mov x12, v14.d[1] ; CHECK-NEXT: fmov x4, d14 -; CHECK-NEXT: mov v27.16b, v23.16b +; CHECK-NEXT: mov v24.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v5.16b ; CHECK-NEXT: mul x1, x16, x14 -; CHECK-NEXT: mov v23.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v7.16b -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: stp q26, 
q31, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v31.16b, v22.16b -; CHECK-NEXT: mul x0, x18, x14 -; CHECK-NEXT: mov v26.16b, v10.16b -; CHECK-NEXT: mov v22.16b, v5.16b +; CHECK-NEXT: ldr q5, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldr x5, [x8] +; CHECK-NEXT: ldr x19, [x19, #128] +; CHECK-NEXT: mov v29.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v0.16b +; CHECK-NEXT: mul x0, x13, x14 +; CHECK-NEXT: mov v25.16b, v6.16b +; CHECK-NEXT: mov v6.16b, v2.16b ; CHECK-NEXT: fmov d15, x17 -; CHECK-NEXT: mov v5.16b, v1.16b -; CHECK-NEXT: mov v8.16b, v20.16b -; CHECK-NEXT: mul x2, x13, x14 -; CHECK-NEXT: mov v20.16b, v16.16b -; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: mov v10.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v4.16b +; CHECK-NEXT: mov v26.16b, v22.16b +; CHECK-NEXT: mov v22.16b, v18.16b +; CHECK-NEXT: mul x2, x18, x14 +; CHECK-NEXT: mov v18.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: mov v16.16b, v4.16b +; CHECK-NEXT: add x8, x8, #8 +; CHECK-NEXT: add x9, x9, #1 ; CHECK-NEXT: mov v15.d[1], x1 ; CHECK-NEXT: mul x3, x12, x14 -; CHECK-NEXT: add x8, x8, #8 -; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: cmp x8, #64 -; CHECK-NEXT: add x9, x9, #1 +; CHECK-NEXT: fmov d14, x0 ; CHECK-NEXT: mul x14, x4, x14 -; CHECK-NEXT: add v18.2d, v18.2d, v15.2d -; CHECK-NEXT: mul x19, x15, x5 +; CHECK-NEXT: add v5.2d, v5.2d, v15.2d +; CHECK-NEXT: mul x20, x15, x5 ; CHECK-NEXT: mov v14.d[1], x2 -; CHECK-NEXT: mul x15, x15, x7 +; CHECK-NEXT: mul x15, x15, x19 ; CHECK-NEXT: fmov d0, x14 -; CHECK-NEXT: str q18, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: ldp q18, q15, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: mul x6, x16, x5 -; CHECK-NEXT: fmov d1, x19 +; CHECK-NEXT: str q5, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: ldr q5, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: mul x21, x13, x19 +; CHECK-NEXT: add v5.2d, v5.2d, v14.2d +; CHECK-NEXT: fmov d3, x20 +; CHECK-NEXT: mul x7, x16, x5 ; CHECK-NEXT: mov v0.d[1], x3 -; CHECK-NEXT: mul x16, x16, x7 -; CHECK-NEXT: fmov d2, x15 -; CHECK-NEXT: add v15.2d, v15.2d, v14.2d -; CHECK-NEXT: mul x21, x18, x7 -; CHECK-NEXT: mov v1.d[1], x6 -; CHECK-NEXT: mul x0, x4, x7 -; CHECK-NEXT: str q15, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: add v15.2d, v11.2d, v14.2d -; CHECK-NEXT: mov v2.d[1], x16 -; CHECK-NEXT: ldr q11, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: mul x20, x13, x7 -; CHECK-NEXT: fmov d3, x21 -; CHECK-NEXT: add v11.2d, v11.2d, v0.2d -; CHECK-NEXT: add v12.2d, v12.2d, v1.2d -; CHECK-NEXT: mul x22, x12, x7 -; CHECK-NEXT: fmov d4, x0 -; CHECK-NEXT: add v18.2d, v18.2d, v2.2d -; CHECK-NEXT: mov v2.16b, v7.16b -; CHECK-NEXT: mul x14, x18, x5 -; CHECK-NEXT: mov v7.16b, v19.16b -; CHECK-NEXT: mov v19.16b, v23.16b -; CHECK-NEXT: mov v3.d[1], x20 -; CHECK-NEXT: mov v23.16b, v27.16b -; CHECK-NEXT: add v27.2d, v9.2d, v1.2d -; CHECK-NEXT: mul x15, x4, x5 -; CHECK-NEXT: str q11, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: mov v11.16b, v15.16b -; CHECK-NEXT: mov v4.d[1], x22 -; CHECK-NEXT: add v19.2d, v19.2d, v1.2d -; CHECK-NEXT: add v7.2d, v7.2d, v1.2d +; CHECK-NEXT: fmov d1, x15 +; CHECK-NEXT: mul x16, x16, x19 +; CHECK-NEXT: str q5, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: add v5.2d, v13.2d, v14.2d +; CHECK-NEXT: fmov d2, x21 +; CHECK-NEXT: ldr q13, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: mul x6, x18, x5 +; CHECK-NEXT: ldp q15, q14, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: mov v3.d[1], x7 +; CHECK-NEXT: add v13.2d, v13.2d, v0.2d +; CHECK-NEXT: mul x18, 
x18, x19 +; CHECK-NEXT: mov v1.d[1], x16 +; CHECK-NEXT: mul x22, x4, x19 +; CHECK-NEXT: str q13, [sp, #80] // 16-byte Folded Spill +; CHECK-NEXT: mov v13.16b, v5.16b +; CHECK-NEXT: mov v5.16b, v17.16b +; CHECK-NEXT: mov v17.16b, v20.16b +; CHECK-NEXT: mov v20.16b, v24.16b ; CHECK-NEXT: mul x13, x13, x5 -; CHECK-NEXT: add v23.2d, v23.2d, v1.2d -; CHECK-NEXT: add v1.2d, v5.2d, v1.2d -; CHECK-NEXT: fmov d14, x14 -; CHECK-NEXT: add v30.2d, v30.2d, v3.2d -; CHECK-NEXT: mov v3.16b, v16.16b +; CHECK-NEXT: mov v24.16b, v28.16b +; CHECK-NEXT: add v11.2d, v11.2d, v3.2d +; CHECK-NEXT: mov v2.d[1], x18 +; CHECK-NEXT: add v15.2d, v15.2d, v1.2d +; CHECK-NEXT: add v27.2d, v27.2d, v3.2d +; CHECK-NEXT: mul x17, x12, x19 +; CHECK-NEXT: add v23.2d, v23.2d, v3.2d +; CHECK-NEXT: add v19.2d, v19.2d, v3.2d +; CHECK-NEXT: fmov d4, x22 +; CHECK-NEXT: add v10.2d, v10.2d, v3.2d +; CHECK-NEXT: mul x14, x4, x5 +; CHECK-NEXT: fmov d0, x13 +; CHECK-NEXT: add v14.2d, v14.2d, v2.2d +; CHECK-NEXT: add v2.2d, v6.2d, v3.2d ; CHECK-NEXT: mul x12, x12, x5 -; CHECK-NEXT: mov v16.16b, v20.16b -; CHECK-NEXT: mov v5.16b, v22.16b -; CHECK-NEXT: fmov d0, x15 -; CHECK-NEXT: add v28.2d, v28.2d, v4.2d -; CHECK-NEXT: mov v4.16b, v17.16b -; CHECK-NEXT: mov v17.16b, v21.16b -; CHECK-NEXT: mov v21.16b, v10.16b -; CHECK-NEXT: mov v10.16b, v26.16b -; CHECK-NEXT: mov v14.d[1], x13 -; CHECK-NEXT: mov v22.16b, v31.16b -; CHECK-NEXT: ldp q26, q31, [sp] // 32-byte Folded Reload -; CHECK-NEXT: mov v0.d[1], x12 -; CHECK-NEXT: add v13.2d, v13.2d, v14.2d -; CHECK-NEXT: add v31.2d, v31.2d, v14.2d -; CHECK-NEXT: add v26.2d, v26.2d, v14.2d -; CHECK-NEXT: add v24.2d, v24.2d, v14.2d -; CHECK-NEXT: add v22.2d, v22.2d, v14.2d -; CHECK-NEXT: add v20.2d, v8.2d, v14.2d -; CHECK-NEXT: add v10.2d, v10.2d, v14.2d -; CHECK-NEXT: add v16.2d, v16.2d, v14.2d -; CHECK-NEXT: add v5.2d, v5.2d, v14.2d -; CHECK-NEXT: add v3.2d, v3.2d, v14.2d -; CHECK-NEXT: add v2.2d, v2.2d, v14.2d -; CHECK-NEXT: add v29.2d, v29.2d, v0.2d -; CHECK-NEXT: add v25.2d, v25.2d, v0.2d -; CHECK-NEXT: add v21.2d, v21.2d, v0.2d +; CHECK-NEXT: mov v3.16b, v7.16b +; CHECK-NEXT: mov v7.16b, v18.16b +; CHECK-NEXT: mov v4.d[1], x17 +; CHECK-NEXT: mov v18.16b, v22.16b +; CHECK-NEXT: mov v0.d[1], x6 +; CHECK-NEXT: fmov d1, x14 +; CHECK-NEXT: add v28.2d, v8.2d, v4.2d +; CHECK-NEXT: mov v1.d[1], x12 +; CHECK-NEXT: add v31.2d, v31.2d, v0.2d +; CHECK-NEXT: add v30.2d, v30.2d, v0.2d +; CHECK-NEXT: add v12.2d, v12.2d, v0.2d +; CHECK-NEXT: add v24.2d, v24.2d, v0.2d +; CHECK-NEXT: add v22.2d, v26.2d, v0.2d +; CHECK-NEXT: add v20.2d, v20.2d, v0.2d +; CHECK-NEXT: add v18.2d, v18.2d, v0.2d ; CHECK-NEXT: add v17.2d, v17.2d, v0.2d -; CHECK-NEXT: add v4.2d, v4.2d, v0.2d -; CHECK-NEXT: add v0.2d, v6.2d, v0.2d +; CHECK-NEXT: add v7.2d, v7.2d, v0.2d +; CHECK-NEXT: add v4.2d, v16.2d, v0.2d +; CHECK-NEXT: add v3.2d, v3.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v21.16b +; CHECK-NEXT: mov v21.16b, v29.16b +; CHECK-NEXT: ldr q29, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add v9.2d, v9.2d, v1.2d +; CHECK-NEXT: add v6.2d, v25.2d, v1.2d +; CHECK-NEXT: add v5.2d, v5.2d, v1.2d +; CHECK-NEXT: add v29.2d, v29.2d, v1.2d +; CHECK-NEXT: add v21.2d, v21.2d, v1.2d +; CHECK-NEXT: add v0.2d, v0.2d, v1.2d ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: ldr q6, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: adrp x8, C ; CHECK-NEXT: add x8, x8, :lo12:C -; CHECK-NEXT: stp q12, q31, [x8, #80] +; CHECK-NEXT: stp q11, q30, [x8, #80] ; CHECK-NEXT: ldp 
x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-NEXT: str q6, [x8] -; CHECK-NEXT: ldr q6, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: str q29, [x8, #112] +; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: stp q15, q14, [x8, #144] ; CHECK-NEXT: ldp x22, x21, [sp, #160] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q11, [x8, #16] -; CHECK-NEXT: ldr q6, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: stp q18, q30, [x8, #144] -; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload -; CHECK-NEXT: stp q6, q13, [x8, #48] +; CHECK-NEXT: stp q1, q13, [x8, #16] +; CHECK-NEXT: ldr q1, [sp, #80] // 16-byte Folded Reload +; CHECK-NEXT: stp q28, q12, [x8, #176] ; CHECK-NEXT: ldp d13, d12, [sp, #112] // 16-byte Folded Reload -; CHECK-NEXT: stp q28, q26, [x8, #176] +; CHECK-NEXT: stp q1, q31, [x8, #48] ; CHECK-NEXT: ldp d15, d14, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: stp q19, q10, [x8, #336] +; CHECK-NEXT: stp q9, q24, [x8, #240] +; CHECK-NEXT: ldp d9, d8, [sp, #144] // 16-byte Folded Reload +; CHECK-NEXT: stp q19, q18, [x8, #336] +; CHECK-NEXT: stp q10, q7, [x8, #400] ; CHECK-NEXT: ldp d11, d10, [sp, #128] // 16-byte Folded Reload +; CHECK-NEXT: str q29, [x8, #112] ; CHECK-NEXT: str q27, [x8, #208] -; CHECK-NEXT: stp q25, q24, [x8, #240] ; CHECK-NEXT: stp q23, q22, [x8, #272] ; CHECK-NEXT: stp q21, q20, [x8, #304] -; CHECK-NEXT: stp q17, q16, [x8, #368] -; CHECK-NEXT: stp q7, q5, [x8, #400] -; CHECK-NEXT: stp q4, q3, [x8, #432] -; CHECK-NEXT: stp q1, q2, [x8, #464] +; CHECK-NEXT: stp q6, q17, [x8, #368] +; CHECK-NEXT: stp q5, q4, [x8, #432] +; CHECK-NEXT: stp q2, q3, [x8, #464] ; CHECK-NEXT: str q0, [x8, #496] ; CHECK-NEXT: add sp, sp, #192 ; CHECK-NEXT: .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll index 47e3381517499..6b03e5d12bfd3 100644 --- a/llvm/test/CodeGen/AArch64/rcpc3-sve.ll +++ b/llvm/test/CodeGen/AArch64/rcpc3-sve.ll @@ -8,8 +8,8 @@ define hidden @test_load_sve_lane0(ptr nocapture noundef readonly %a, noundef %b) local_unnamed_addr { ; CHECK-LABEL: test_load_sve_lane0: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: ldapr x8, [x0] +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %1 = load atomic i64, ptr %a acquire, align 8 @@ -20,9 +20,9 @@ define hidden @test_load_sve_lane0(ptr nocapture noundef read define hidden @test_load_sve_lane1(ptr nocapture noundef readonly %a, noundef %b) local_unnamed_addr { ; CHECK-LABEL: test_load_sve_lane1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: ldapr x8, [x0] ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/reassocmls.ll b/llvm/test/CodeGen/AArch64/reassocmls.ll index 381caffba92eb..acbf9fc584a2e 100644 --- a/llvm/test/CodeGen/AArch64/reassocmls.ll +++ b/llvm/test/CodeGen/AArch64/reassocmls.ll @@ -79,7 +79,7 @@ define i64 @mls_i64_C(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e) { ; CHECK-LABEL: mls_i64_C: ; CHECK: // %bb.0: ; CHECK-NEXT: mul x8, x2, x1 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: madd x8, x4, x3, x8 ; CHECK-NEXT: sub x0, x9, x8 ; CHECK-NEXT: ret @@ -290,9 +290,9 @@ define @smlsl_nxv8i16( %a, @umlsl_nxv8i16( %a, %b, %c, %d, %e) { ; CHECK-LABEL: umlsl_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z3.h, z3.h, #0xff ; 
CHECK-NEXT: and z4.h, z4.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z2.h, z2.h, #0xff ; CHECK-NEXT: mls z0.h, p0/m, z4.h, z3.h @@ -326,8 +326,8 @@ define @mls_nxv8i16( %a, define @mla_nxv8i16( %a, %b, %c, %d, %e) { ; CHECK-LABEL: mla_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z1.h, z2.h, z1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mla z1.h, p0/m, z4.h, z3.h ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index a080a7403811f..325ab444205bf 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -20,111 +20,111 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ldr d6, [x10, x8] ; CHECK-NEXT: ldr d5, [x11] ; CHECK-NEXT: ldr d7, [x11, x9] -; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b -; CHECK-NEXT: usubl v1.8h, v4.8b, v5.8b +; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b -; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 +; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 ; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 -; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddw v0.4s, v5.4s, v0.4h +; CHECK-NEXT: shll2 v7.4s, v2.8h, #16 +; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h ; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h -; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h +; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h +; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s +; CHECK-NEXT: zip2 v6.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s ; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: zip1 v4.4s, v2.4s, v0.4s -; CHECK-NEXT: zip2 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: uzp2 v5.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v17.16b, v1.16b -; CHECK-NEXT: zip2 v16.4s, v1.4s, v3.4s -; CHECK-NEXT: mov v7.s[3], v0.s[2] -; CHECK-NEXT: ext v18.16b, v3.16b, v3.16b, #12 -; CHECK-NEXT: ext v2.16b, v2.16b, v4.16b, #8 -; CHECK-NEXT: mov v17.s[1], v3.s[0] +; CHECK-NEXT: ext v17.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip2 v18.4s, v3.4s, v2.4s +; CHECK-NEXT: ext v16.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v1.s[3], v0.s[2] +; CHECK-NEXT: mov v7.s[1], v3.s[0] ; CHECK-NEXT: uzp2 v0.4s, v5.4s, v3.4s -; CHECK-NEXT: zip2 v5.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v3.s[0], v1.s[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v18.16b, #12 -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v17.d[1], v2.d[1] +; CHECK-NEXT: zip2 v5.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v3.s[0], v2.s[1] +; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #12 +; CHECK-NEXT: mov v18.d[1], v1.d[1] +; CHECK-NEXT: mov v7.d[1], v16.d[1] ; CHECK-NEXT: mov v0.d[1], v6.d[1] -; CHECK-NEXT: mov v5.d[1], v7.d[1] ; CHECK-NEXT: mov v3.d[1], v4.d[1] -; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: add v2.4s, v3.4s, v17.4s -; CHECK-NEXT: sub v3.4s, v17.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v16.4s, v1.4s +; CHECK-NEXT: mov v5.d[1], v1.d[1] +; CHECK-NEXT: mov v2.d[1], v6.d[1] +; CHECK-NEXT: add v0.4s, v0.4s, v18.4s +; CHECK-NEXT: add v1.4s, v3.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v3.4s +; CHECK-NEXT: sub v2.4s, v5.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: add v6.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, 
v3.4s, v1.4s +; CHECK-NEXT: rev64 v6.4s, v1.4s +; CHECK-NEXT: sub v5.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: rev64 v3.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s -; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: rev64 v4.4s, v6.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: addp v16.4s, v0.4s, v6.4s -; CHECK-NEXT: addp v17.4s, v2.4s, v1.4s -; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s -; CHECK-NEXT: ext v5.16b, v17.16b, v1.16b, #4 -; CHECK-NEXT: ext v6.16b, v16.16b, v4.16b, #4 -; CHECK-NEXT: mov v18.16b, v1.16b -; CHECK-NEXT: mov v19.16b, v4.16b -; CHECK-NEXT: ext v3.16b, v2.16b, v17.16b, #8 -; CHECK-NEXT: ext v7.16b, v0.16b, v16.16b, #4 -; CHECK-NEXT: mov v18.s[2], v17.s[3] -; CHECK-NEXT: zip2 v5.4s, v5.4s, v17.4s -; CHECK-NEXT: zip2 v6.4s, v6.4s, v16.4s -; CHECK-NEXT: mov v19.s[2], v16.s[3] -; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s -; CHECK-NEXT: ext v20.16b, v3.16b, v2.16b, #4 -; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 -; CHECK-NEXT: mov v2.s[2], v17.s[1] -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #12 -; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 -; CHECK-NEXT: mov v5.16b, v18.16b -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v20.4s -; CHECK-NEXT: mov v6.16b, v7.16b -; CHECK-NEXT: mov v20.16b, v19.16b -; CHECK-NEXT: mov v21.16b, v2.16b -; CHECK-NEXT: mov v5.s[1], v17.s[2] -; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s -; CHECK-NEXT: mov v6.s[0], v16.s[1] -; CHECK-NEXT: mov v20.s[1], v16.s[2] -; CHECK-NEXT: sub v16.4s, v19.4s, v4.4s -; CHECK-NEXT: mov v21.s[1], v17.s[0] -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v17.4s, v18.4s, v1.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: mov v6.d[1], v1.d[1] +; CHECK-NEXT: rev64 v3.4s, v5.4s +; CHECK-NEXT: rev64 v7.4s, v2.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v6.4s -; CHECK-NEXT: add v4.4s, v20.4s, v4.4s -; CHECK-NEXT: add v3.4s, v21.4s, v3.4s -; CHECK-NEXT: mov v1.d[1], v17.d[1] -; CHECK-NEXT: mov v0.d[1], v7.d[1] -; CHECK-NEXT: mov v4.d[1], v16.d[1] -; CHECK-NEXT: mov v3.d[1], v2.d[1] +; CHECK-NEXT: sub v3.4s, v5.4s, v3.4s +; CHECK-NEXT: addp v4.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v5.4s, v2.4s, v7.4s +; CHECK-NEXT: addp v2.4s, v0.4s, v2.4s +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v1.4s +; CHECK-NEXT: ext v16.16b, v4.16b, v3.16b, #4 +; CHECK-NEXT: ext v17.16b, v2.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s +; CHECK-NEXT: mov v16.16b, v5.16b +; CHECK-NEXT: zip2 v17.4s, v17.4s, v2.4s +; CHECK-NEXT: ext v18.16b, v0.16b, v2.16b, #4 +; CHECK-NEXT: mov v7.s[2], v4.s[3] +; CHECK-NEXT: mov v21.16b, v1.16b +; CHECK-NEXT: mov v16.s[2], v2.s[3] +; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12 +; CHECK-NEXT: zip1 v17.4s, v2.4s, v2.4s +; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 +; CHECK-NEXT: mov v19.16b, v7.16b +; CHECK-NEXT: ext v6.16b, v1.16b, v4.16b, #8 +; CHECK-NEXT: mov v21.s[2], v4.s[1] +; CHECK-NEXT: mov v20.16b, v16.16b +; CHECK-NEXT: mov v19.s[1], v4.s[2] +; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s +; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s +; CHECK-NEXT: mov v17.16b, v18.16b +; CHECK-NEXT: ext v1.16b, v6.16b, v1.16b, #4 
+; CHECK-NEXT: sub v7.4s, v7.4s, v3.4s +; CHECK-NEXT: mov v20.s[1], v2.s[2] +; CHECK-NEXT: mov v17.s[0], v2.s[1] +; CHECK-NEXT: mov v2.16b, v21.16b +; CHECK-NEXT: add v3.4s, v19.4s, v3.4s +; CHECK-NEXT: uzp2 v1.4s, v6.4s, v1.4s +; CHECK-NEXT: add v5.4s, v20.4s, v5.4s +; CHECK-NEXT: mov v2.s[1], v4.s[0] +; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s +; CHECK-NEXT: mov v3.d[1], v7.d[1] +; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: mov v5.d[1], v16.d[1] +; CHECK-NEXT: sub v6.4s, v21.4s, v1.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: cmlt v4.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v2.8h, v5.8h, #0 +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: add v3.4s, v4.4s, v3.4s +; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 +; CHECK-NEXT: add v5.4s, v2.4s, v5.4s +; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b ; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 -; CHECK-NEXT: cmlt v2.8h, v0.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v5.8h, v3.8h, #0 +; CHECK-NEXT: add v0.4s, v6.4s, v0.4s +; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v4.4s, v6.4s, v4.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s +; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b -; CHECK-NEXT: eor v0.16b, v0.16b, v2.16b -; CHECK-NEXT: eor v2.16b, v3.16b, v5.16b -; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -278,13 +278,13 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: add v2.4s, v2.4s, v16.4s ; CHECK-NEXT: add v3.4s, v4.4s, v17.4s +; CHECK-NEXT: rev64 v5.4s, v2.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: sub v1.4s, v17.4s, v4.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s ; CHECK-NEXT: rev64 v6.4s, v3.4s +; CHECK-NEXT: mov v5.d[1], v2.d[1] ; CHECK-NEXT: sub v4.4s, v1.4s, v0.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] ; CHECK-NEXT: mov v6.d[1], v3.d[1] ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s ; CHECK-NEXT: add v1.4s, v2.4s, v6.4s @@ -304,40 +304,40 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: mov v4.d[1], v16.d[1] ; CHECK-NEXT: mov v1.d[1], v7.d[1] ; CHECK-NEXT: add v0.4s, v17.4s, v1.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s ; CHECK-NEXT: add v2.4s, v18.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v17.4s ; CHECK-NEXT: sub v3.4s, v4.4s, v18.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v1.4s -; CHECK-NEXT: ext v5.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v1.4s ; CHECK-NEXT: zip2 v7.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v0.4s, v0.4s, v1.4s -; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v5.16b, v1.16b, #8 -; CHECK-NEXT: ext v18.16b, v6.16b, v3.16b, #8 -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 -; CHECK-NEXT: ext v5.16b, v18.16b, v6.16b, #4 -; CHECK-NEXT: cmlt v2.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v4.4s, v2.4s, 
v4.4s -; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v1.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v3.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: add v2.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: cmlt v4.8h, v1.8h, #0 -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b -; CHECK-NEXT: add v0.4s, v0.4s, v2.4s -; CHECK-NEXT: eor v1.16b, v1.16b, v4.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 @@ -461,93 +461,93 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: ldr d3, [x11, x9] ; CHECK-NEXT: ldr d4, [x10] ; CHECK-NEXT: ldr d5, [x11] +; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b -; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 -; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 -; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 -; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h -; CHECK-NEXT: rev64 v4.4s, v0.4s +; CHECK-NEXT: shll2 v3.4s, v1.8h, #16 +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h +; CHECK-NEXT: shll2 v5.4s, v2.8h, #16 +; CHECK-NEXT: saddw v1.4s, v3.4s, v1.4h +; CHECK-NEXT: shll2 v3.4s, v4.8h, #16 +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v5.4s, v2.4h ; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s +; CHECK-NEXT: saddw v3.4s, v3.4s, v4.4h +; CHECK-NEXT: rev64 v4.4s, v2.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s +; CHECK-NEXT: rev64 v7.4s, v3.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s ; CHECK-NEXT: addp v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v7.4s +; CHECK-NEXT: ext v1.16b, v5.16b, v6.16b, #4 +; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: ext v7.16b, v5.16b, v4.16b, #4 -; CHECK-NEXT: mov v4.s[3], v5.s[2] -; CHECK-NEXT: zip2 v16.4s, v6.4s, v1.4s -; CHECK-NEXT: zip1 v1.4s, v6.4s, v1.4s -; CHECK-NEXT: uzp2 v6.4s, v2.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v7.16b, v5.16b, #4 +; CHECK-NEXT: mov v6.s[3], v5.s[2] +; CHECK-NEXT: zip2 v16.4s, v4.4s, v7.4s +; CHECK-NEXT: zip1 v4.4s, v4.4s, v7.4s +; CHECK-NEXT: ext v1.16b, v1.16b, v5.16b, #4 +; CHECK-NEXT: uzp2 v5.4s, v2.4s, v0.4s ; CHECK-NEXT: uzp1 v0.4s, v2.4s, v0.4s ; 
CHECK-NEXT: uzp1 v7.4s, v2.4s, v3.4s ; CHECK-NEXT: uzp2 v2.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v16.d[1], v4.d[1] -; CHECK-NEXT: rev64 v3.4s, v6.4s -; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v16.d[1], v6.d[1] +; CHECK-NEXT: mov v4.d[1], v1.d[1] +; CHECK-NEXT: rev64 v1.4s, v5.4s ; CHECK-NEXT: rev64 v0.4s, v0.4s ; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v1.4s, v16.4s -; CHECK-NEXT: add v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v1.4s, v16.4s, v1.4s -; CHECK-NEXT: zip1 v3.4s, v2.4s, v4.4s +; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v16.4s, v4.4s +; CHECK-NEXT: zip1 v4.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s ; CHECK-NEXT: uzp2 v6.4s, v0.4s, v1.4s ; CHECK-NEXT: zip2 v7.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v4.4s -; CHECK-NEXT: ext v16.16b, v2.16b, v3.16b, #8 +; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v16.16b, v2.16b, v4.16b, #8 ; CHECK-NEXT: trn2 v5.4s, v0.4s, v5.4s ; CHECK-NEXT: uzp2 v6.4s, v6.4s, v0.4s -; CHECK-NEXT: mov v2.s[3], v4.s[2] +; CHECK-NEXT: mov v2.s[3], v3.s[2] ; CHECK-NEXT: mov v0.s[1], v1.s[1] ; CHECK-NEXT: mov v5.d[1], v16.d[1] ; CHECK-NEXT: mov v6.d[1], v17.d[1] ; CHECK-NEXT: mov v7.d[1], v2.d[1] -; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: mov v0.d[1], v4.d[1] ; CHECK-NEXT: add v1.4s, v6.4s, v7.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v6.4s -; CHECK-NEXT: add v3.4s, v5.4s, v0.4s +; CHECK-NEXT: add v2.4s, v5.4s, v0.4s +; CHECK-NEXT: sub v3.4s, v7.4s, v6.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: ext v6.16b, v3.16b, v3.16b, #4 -; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v16.4s, v0.4s, v3.4s -; CHECK-NEXT: zip2 v17.4s, v3.4s, v0.4s -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s -; CHECK-NEXT: ext v2.16b, v5.16b, v2.16b, #8 -; CHECK-NEXT: ext v18.16b, v6.16b, v0.16b, #8 -; CHECK-NEXT: zip1 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: add v3.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v17.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v5.16b, #4 -; CHECK-NEXT: ext v5.16b, v18.16b, v6.16b, #4 +; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v3.4s, v1.4s +; CHECK-NEXT: zip2 v16.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v0.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v2.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s -; CHECK-NEXT: cmlt v1.8h, v4.8h, #0 -; CHECK-NEXT: cmlt v6.8h, v3.8h, #0 -; CHECK-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-NEXT: add v4.4s, v1.4s, v4.4s -; CHECK-NEXT: add v2.4s, v5.4s, v2.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: add v2.4s, v6.4s, v2.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s +; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; 
CHECK-NEXT: eor v1.16b, v3.16b, v1.16b +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s -; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: lsr w9, w8, #16 diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll index 86c224bee990a..2deb19be24821 100644 --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -346,9 +346,9 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_min(<16 x i8> %x) { ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.16b, #213 +; CHECK-NEXT: movi v2.16b, #42 ; CHECK-NEXT: umin v0.16b, v0.16b, v1.16b -; CHECK-NEXT: movi v1.16b, #42 -; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: add v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %c = icmp ult <16 x i8> %x, %s = select <16 x i1> %c, <16 x i8> %x, <16 x i8> @@ -384,9 +384,9 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_min(<8 x i16> %x) { ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_min: ; CHECK: // %bb.0: ; CHECK-NEXT: mvni v1.8h, #42 +; CHECK-NEXT: movi v2.8h, #42 ; CHECK-NEXT: umin v0.8h, v0.8h, v1.8h -; CHECK-NEXT: movi v1.8h, #42 -; CHECK-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-NEXT: add v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret %c = icmp ult <8 x i16> %x, %s = select <8 x i1> %c, <8 x i16> %x, <8 x i16> diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index 3e0d5dd875097..5237a3491de9b 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -245,15 +245,15 @@ define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { ; CHECK-GI-LABEL: sext_v3i8_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #24 // =0x18 -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: neg v2.4s, v0.4s -; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i32> @@ -408,15 +408,15 @@ define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { ; CHECK-GI-LABEL: sext_v3i10_v3i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov w8, #22 // =0x16 -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: mov v1.s[1], w1 -; CHECK-GI-NEXT: mov v0.s[1], w8 -; CHECK-GI-NEXT: mov v1.s[2], w2 -; CHECK-GI-NEXT: mov v0.s[2], w8 -; CHECK-GI-NEXT: neg v2.4s, v0.4s -; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s -; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w8 +; CHECK-GI-NEXT: mov v0.s[1], w1 +; CHECK-GI-NEXT: mov v1.s[1], w8 +; CHECK-GI-NEXT: mov v0.s[2], w2 +; CHECK-GI-NEXT: mov v1.s[2], w8 +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a 
to <3 x i32> diff --git a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll index f4f75bb9c7825..88e062d2c999c 100644 --- a/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/AArch64/sink-addsub-of-const.ll @@ -158,8 +158,8 @@ define i32 @sink_sub_from_const_to_sub2(i32 %a, i32 %b) { define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI12_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -170,8 +170,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI13_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI13_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -186,8 +186,8 @@ define <4 x i32> @vec_sink_add_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI14_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI14_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -198,8 +198,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI15_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI15_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -214,8 +214,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_add0: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI16_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -226,8 +226,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add0(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_add1: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI17_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -242,8 +242,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_add1(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI18_0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI18_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -254,8 +254,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> 
@vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_add_of_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI19_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -270,8 +270,8 @@ define <4 x i32> @vec_sink_add_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI20_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -282,8 +282,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_of_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: adrp x8, .LCPI21_0 +; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI21_0] ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret @@ -298,8 +298,8 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_sub: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI22_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI22_0] ; CHECK-NEXT: sub v0.4s, v1.4s, v0.4s ; CHECK-NEXT: ret @@ -310,8 +310,8 @@ define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: vec_sink_sub_from_const_to_sub2: ; CHECK: // %bb.0: -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: adrp x8, .LCPI23_0 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI23_0] ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll index 0c674c5685e00..1d1bae42c9e30 100644 --- a/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll +++ b/llvm/test/CodeGen/AArch64/sme-pstate-sm-changing-call-disable-coalescing.ll @@ -28,8 +28,8 @@ define void @dont_coalesce_arg_i8(i8 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -61,8 +61,8 @@ define void @dont_coalesce_arg_i16(i16 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -94,8 +94,8 @@ define void @dont_coalesce_arg_i32(i32 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.s ; 
CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -127,8 +127,8 @@ define void @dont_coalesce_arg_i64(i64 %arg, ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl use_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload @@ -165,8 +165,8 @@ define void @dont_coalesce_arg_f16(half %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -205,8 +205,8 @@ define void @dont_coalesce_arg_f32(float %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: bl use_f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -245,8 +245,8 @@ define void @dont_coalesce_arg_f64(double %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -290,8 +290,8 @@ define void @dont_coalesce_arg_v1i8(<1 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -331,8 +331,8 @@ define void @dont_coalesce_arg_v1i16(<1 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -372,8 +372,8 @@ define void @dont_coalesce_arg_v1i32(<1 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -413,8 +413,8 @@ define void @dont_coalesce_arg_v1i64(<1 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -454,8 +454,8 @@ define void @dont_coalesce_arg_v1f16(<1 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; 
CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -495,8 +495,8 @@ define void @dont_coalesce_arg_v1f32(<1 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -536,8 +536,8 @@ define void @dont_coalesce_arg_v1f64(<1 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -581,8 +581,8 @@ define void @dont_coalesce_arg_v16i8(<16 x i8> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v16i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -621,8 +621,8 @@ define void @dont_coalesce_arg_v8i16(<8 x i16> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -661,8 +661,8 @@ define void @dont_coalesce_arg_v4i32(<4 x i32> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -701,8 +701,8 @@ define void @dont_coalesce_arg_v2i64(<2 x i64> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -741,8 +741,8 @@ define void @dont_coalesce_arg_v8f16(<8 x half> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8f16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -781,8 +781,8 @@ define void @dont_coalesce_arg_v8bf16(<8 x bfloat> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v8bf16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -821,8 +821,8 @@ define void @dont_coalesce_arg_v4f32(<4 x float> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v4f32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; 
CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -861,8 +861,8 @@ define void @dont_coalesce_arg_v2f64(<2 x double> %arg, ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: bl use_v2f64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr z0, [x8] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: addvl sp, sp, #1 @@ -894,9 +894,9 @@ define void @dont_coalesce_arg_v8i1(<8 x i1> %arg, ptr %ptr) #0 { ; CHECK-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: add x8, sp, #16 ; CHECK-NEXT: mov x19, x0 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -942,13 +942,13 @@ define void @dont_coalesce_res_i8(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i8 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1b { z0.b }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i8 @get_i8() @@ -969,13 +969,13 @@ define void @dont_coalesce_res_i16(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i16 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1h { z0.h }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i16 @get_i16() @@ -996,13 +996,13 @@ define void @dont_coalesce_res_i32(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i32 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1w { z0.s }, p0, [x19] ; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i32 @get_i32() @@ -1023,13 +1023,13 @@ define void @dont_coalesce_res_i64(ptr %ptr) #0 { ; CHECK-NEXT: smstop sm ; CHECK-NEXT: bl get_i64 ; CHECK-NEXT: smstart sm -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov d0, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: st1d { z0.d }, p0, [x19] ; CHECK-NEXT: ldp x30, 
x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp], #80 // 16-byte Folded Reload ; CHECK-NEXT: ret %res = call i64 @get_i64() @@ -1056,11 +1056,11 @@ define void @dont_coalesce_res_f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call half @get_f16() @@ -1086,11 +1086,11 @@ define void @dont_coalesce_res_f32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr s0, [sp, #12] // 4-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call float @get_f32() @@ -1116,11 +1116,11 @@ define void @dont_coalesce_res_f64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call double @get_f64() @@ -1150,11 +1150,11 @@ define void @dont_coalesce_res_v1i8(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i8> @get_v1i8() @@ -1181,11 +1181,11 @@ define void @dont_coalesce_res_v1i16(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; 
CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i16> @get_v1i16() @@ -1212,11 +1212,11 @@ define void @dont_coalesce_res_v1i32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i32> @get_v1i32() @@ -1243,11 +1243,11 @@ define void @dont_coalesce_res_v1i64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x i64> @get_v1i64() @@ -1275,11 +1275,11 @@ define void @dont_coalesce_res_v1f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr h0, [sp, #14] // 2-byte Folded Reload ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x half> @get_v1f16() @@ -1306,11 +1306,11 @@ define void @dont_coalesce_res_v1f32(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x float> @get_v1f32() @@ -1337,11 +1337,11 @@ define void @dont_coalesce_res_v1f64(ptr %ptr) #0 { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr d0, [sp, #8] // 8-byte Folded Reload ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 
16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <1 x double> @get_v1f64() @@ -1373,11 +1373,11 @@ define void @dont_coalesce_res_v16i8(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1b { z0.b }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1b { z0.b }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <16 x i8> @get_v16i8() @@ -1404,11 +1404,11 @@ define void @dont_coalesce_res_v8i16(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x i16> @get_v8i16() @@ -1435,11 +1435,11 @@ define void @dont_coalesce_res_v4i32(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x i32> @get_v4i32() @@ -1466,11 +1466,11 @@ define void @dont_coalesce_res_v2i64(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x i64> @get_v2i64() @@ -1497,11 +1497,11 @@ define void @dont_coalesce_res_v8f16(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1h { z0.h }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1h { z0.h }, p0, 
[x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <8 x half> @get_v8f16() @@ -1528,11 +1528,11 @@ define void @dont_coalesce_res_v4f32(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1w { z0.s }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1w { z0.s }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <4 x float> @get_v4f32() @@ -1559,11 +1559,11 @@ define void @dont_coalesce_res_v2f64(ptr %ptr) #0 { ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: ldp d9, d8, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: st1d { z0.d }, p0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: ldp d11, d10, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp d13, d12, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: ldp d15, d14, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: st1d { z0.d }, p0, [x19] -; CHECK-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; CHECK-NEXT: ret %res = call <2 x double> @get_v2f64() diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll index 47b24290d3c85..1e16f140676ba 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll @@ -151,8 +151,8 @@ define <2 x double> @streaming_compatible_with_neon_vectors(<2 x double> %arg) " ; CHECK-NEXT: // %bb.3: ; CHECK-NEXT: smstart sm ; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add x8, sp, #16 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll index 0097968b1171d..b4fd5a2272e7e 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp-dots.ll @@ -26,16 +26,16 @@ define void @fdot_multi_za32_f16_vg1x2(i32 %slice, %unused, < define void @fdot_multi_za32_f16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_f16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: fdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: fdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -71,16 +71,16 @@ define void @bfdot_multi_za32_bf16_vg1x2(i32 %slice, %unused, define void @fdot_multi_za32_bf16_vg1x4(i32 %slice, %unused, 
%zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: fdot_multi_za32_bf16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: bfdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: bfdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll index 6d98604837115..e154a4df86efe 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll @@ -26,16 +26,16 @@ define void @udot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -68,16 +68,16 @@ define void @udot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: udot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -110,16 +110,16 @@ define void @udot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @udot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: udot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: udot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: udot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -152,16 +152,16 @@ define void @usdot_multi_za32_u8_vg1x2(i32 %slice, %unused, < define void @usdot_multi_za32_u8_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: usdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, 
z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: usdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -197,16 +197,16 @@ define void @sdot_multi_za32_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za32_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -239,16 +239,16 @@ define void @sdot_multi_za32_u8_vg1x2(i32 %slice, %unused, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za32_u8_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: sdot za.s[w8, 7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -281,16 +281,16 @@ define void @sdot_multi_za64_u16_vg1x2(i32 %slice, %unused, < define void @sdot_multi_za64_u16_vg1x4(i32 %slice, %unused, %zn0, %zn1, %zn2, %zn3, ; CHECK-LABEL: sdot_multi_za64_u16_vg1x4: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: sdot za.d[w8, 0, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: sdot za.d[w8, 7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll index e95d29f65e55e..92e8877927ea5 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-max.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SMAX (Single, x2) @@ -151,8 +152,7 @@ define { , } @multi_vec_max_single_x2 ; SMAX (Single, x4) -define { , , , } -@multi_vec_max_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; 
CHECK-LABEL: multi_vec_max_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -170,8 +170,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -189,8 +188,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -208,8 +206,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -229,8 +226,7 @@ define { , , , , , , } -@multi_vec_max_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -248,8 +244,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -267,8 +262,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -286,8 +280,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -307,8 +300,7 @@ define { , , , , , , } -@multi_vec_max_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -326,8 +318,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -345,8 +336,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_max_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_max_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_max_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -537,104 +527,100 @@ define { , } @multi_vec_max_multi_x2_ ; SMAX (Multi, x4) -define { , , , } -@multi_vec_max_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } 
@multi_vec_max_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; 
CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smax.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -643,104 +629,100 @@ define { , , , , , , } -@multi_vec_max_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; 
CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_u64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umax.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -749,78 +731,75 @@ define { , , , , , , } -@multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; 
CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_max_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_max_multi_x4_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmax { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmax.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -870,8 +849,7 @@ define { , } @multi_vec_maxnm_single ; FMAXNM (Single, x4) -define { , , , } -@multi_vec_maxnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -889,8 +867,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -908,8 +885,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_maxnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_maxnm_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -976,19 +952,18 @@ define { , } @multi_vec_maxnm_x2_f64( ; FMAXNM (Multi, x4) -define { , , , } -@multi_vec_maxnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.h - z27.h 
}, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -996,24 +971,23 @@ define { , , , , , , } - @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, + @llvm.aarch64.sve.fmaxnm.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_maxnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1026,19 +1000,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_maxnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_maxnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_maxnm_x4_f64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmaxnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll index 21a55c6436acd..363f9ba5d3530 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-min.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SMIN (Single, x2) @@ -151,8 +152,7 @@ define { , } @multi_vec_min_single_x2 ; SMIN (Single, x4) -define { , , , } -@multi_vec_min_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -170,8 +170,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -189,8 +188,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } 
@multi_vec_min_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -208,8 +206,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -229,8 +226,7 @@ define { , , , , , , } -@multi_vec_min_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -248,8 +244,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -267,8 +262,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -286,8 +280,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -307,8 +300,7 @@ define { , , , , , , } -@multi_vec_min_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -326,8 +318,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -345,8 +336,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_min_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_min_single_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_min_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -537,104 +527,100 @@ define { , } @multi_vec_min_multi_x2_ ; SMIN (Multi, x4) -define { , , , } -@multi_vec_min_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; 
CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_s64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: smin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.smin.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -643,104 +629,100 @@ define { , , , , , , } -@multi_vec_min_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov 
z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_u64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_u64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: umin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.umin.x4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -749,78 +731,75 @@ define { , , , , , , } -@multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, 
%zm4) { +define { , , , } @multi_vec_min_multi_x4_f16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv8f16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv4f32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_min_multi_x4_f64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_min_multi_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fmin { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.fmin.x4.nxv2f64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) @@ -870,8 +849,7 @@ define { , } @multi_vec_minnm_single ; FMINNM (Single, x4) -define { , , , } -@multi_vec_minnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -889,8 +867,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -908,8 +885,7 @@ define { , , , , , , } %res } -define { , , , } 
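; NOTE: the only IR change in these FMINNM hunks is folding each wrapped
; "define"/"@name(...)" header onto a single line, presumably so that
; utils/update_llc_test_checks.py can key each CHECK block off a one-line
; define. With the elided types restored (inferred from the f64 element
; type), the header being rewritten just below is expected to be, modulo
; comment wrapping:
;
; define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> }
;     @multi_vec_minnm_single_x4_f64(<vscale x 2 x double> %dummy,
;                                    <vscale x 2 x double> %zdn1, <vscale x 2 x double> %zdn2,
;                                    <vscale x 2 x double> %zdn3, <vscale x 2 x double> %zdn4,
;                                    <vscale x 2 x double> %zm)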
-@multi_vec_minnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_minnm_single_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_minnm_single_x4_f64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -976,19 +952,18 @@ define { , } @multi_vec_minnm_x2_f64( ; FMINNM (Multi, x4) -define { , , , } -@multi_vec_minnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1001,19 +976,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -1026,19 +1000,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_minnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_minnm_x4_f64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_minnm_x4_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z30.d, z7.d ; CHECK-NEXT: mov z27.d, z4.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z29.d, z6.d ; CHECK-NEXT: mov z26.d, z3.d ; CHECK-NEXT: mov z28.d, z5.d ; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: fminnm { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll index f766bfcff4d1d..346afc611eb75 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlall.ll @@ -142,16 +142,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov 
z28.d, z1.d ; CHECK-NEXT: smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -164,16 +164,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -418,16 +418,16 @@ define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -440,16 +440,16 @@ define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -694,16 +694,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -716,16 +716,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: 
multi_vector_mul_sub_multi_long_vg4x4_s16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -970,16 +970,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret @@ -992,16 +992,16 @@ define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1h { z27.h }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h } ; CHECK-NEXT: ret @@ -1275,16 +1275,16 @@ define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, %dummy, %zn0, %zn1, %zn2, %zn3, %zm0, %zm1, %zm2, %zm3) { ; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z26.d, z7.d ; CHECK-NEXT: mov z31.d, z4.d -; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: mov z25.d, z6.d ; CHECK-NEXT: mov z30.d, z3.d ; CHECK-NEXT: mov z24.d, z5.d ; CHECK-NEXT: mov z29.d, z2.d -; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: ld1b { z27.b }, p0/z, [x1] +; CHECK-NEXT: mov z28.d, z1.d ; CHECK-NEXT: usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b } ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll index d138a3af43852..12a940ff03e29 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-rshl.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SRSHL (Single, x2) @@ -56,8 +57,7 @@ define { , } 
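; NOTE: besides the autogeneration banner added at the top of this file, the
; rshl changes mirror the min/minnm ones: one-line define headers plus CHECK
; rescheduling. Written out with its inferred <vscale x 16 x i8> types, the
; header completed in the hunk below is expected to be:
;
; define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
;     @multi_vec_rounding_shl_single_x4_s8(<vscale x 16 x i8> %dummy,
;                                          <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
;                                          <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
;                                          <vscale x 16 x i8> %zm)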
@multi_vec_rounding_shl_single ; SRSHL (Single, x4) -define { , , , } -@multi_vec_rounding_shl_single_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -75,8 +75,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -94,8 +93,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -113,8 +111,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -188,8 +185,7 @@ define { , } @multi_vec_rounding_shl_single ; URSHL (Single, x4) -define { , , , } -@multi_vec_rounding_shl_single_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -207,8 +203,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -226,8 +221,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -245,8 +239,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_single_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_rounding_shl_single_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_rounding_shl_single_x4_u64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -328,19 +321,18 @@ define { , } @multi_vec_rounding_shl_x2_s64 ; SRSHL (Multi, x4) -define { , , , } -@multi_vec_rounding_shl_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, 
z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -353,19 +345,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -378,19 +369,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -403,19 +393,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_s64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: srshl { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -492,19 +481,18 @@ define { , } @multi_vec_rounding_uhl_x2_u64 ; URSHL (Multi, x4) -define { , , , } -@multi_vec_rounding_shl_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u8( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; 
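; NOTE: the CHECK churn in these x4 hunks is scheduling-only: the same eight
; movs still assemble the two consecutive tuples z24-z27 and z28-z31 that the
; grouped srshl/urshl form "{ z24.b - z27.b }, ..., { z28.b - z31.b }"
; requires, but the predicate setup and the load are now interleaved with
; the copies, roughly:
;
; CHECK-NEXT: mov z30.d, z7.d
; CHECK-NEXT: mov z27.d, z4.d
; CHECK-NEXT: ptrue p0.b                 // ptrue now issued after the first copies
; ...
; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0]
; CHECK-NEXT: mov z24.d, z1.d            // last copy sunk below the load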
CHECK-LABEL: multi_vec_rounding_shl_x4_u8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -517,19 +505,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u16( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -542,19 +529,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u32( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d @@ -567,19 +553,18 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_rounding_shl_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_rounding_shl_x4_u64( %dummy, %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) { ; CHECK-LABEL: multi_vec_rounding_shl_x4_u64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: urshl { z24.d - z27.d }, { z24.d - z27.d }, 
{ z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll index 9c5dff6c3bf6f..e71afe213d8a5 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-sqdmulh.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s ; SQDMULH (Single, x2) @@ -56,8 +57,7 @@ define { , } @multi_vec_sat_double_mulh_sin ; SQDMULH (Single, x4) -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -75,8 +75,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s16: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -94,8 +93,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -113,8 +111,7 @@ define { , , , , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { +define { , , , } @multi_vec_sat_double_mulh_single_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, %zm) { ; CHECK-LABEL: multi_vec_sat_double_mulh_single_x4_s64: ; CHECK: // %bb.0: ; CHECK-NEXT: mov z27.d, z4.d @@ -196,104 +193,100 @@ define { , } @multi_vec_sat_double_mulh_mul ; SQDMULH (x4, Multi) -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s8( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s8: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1b { z31.b }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.b - z27.b }, { z24.b - z27.b }, { z28.b - z31.b } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv16i8( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s16( %unused, %zdn1, %zdn2, %zdn3, 
%zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s16: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1h { z31.h }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.h - z27.h }, { z24.h - z27.h }, { z28.h - z31.h } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv8i16( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s32( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s32: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1w { z31.s }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.s - z27.s }, { z24.s - z27.s }, { z28.s - z31.s } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv4i32( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) ret { , , , } %res } -define { , , , } -@multi_vec_sat_double_mulh_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, - %zm1, %zm2, %zm3, %zm4) { +define { , , , } @multi_vec_sat_double_mulh_multi_x4_s64( %unused, %zdn1, %zdn2, %zdn3, %zdn4, ; CHECK-LABEL: multi_vec_sat_double_mulh_multi_x4_s64: ; CHECK: // %bb.0: +; CHECK-NEXT: mov z30.d, z7.d +; CHECK-NEXT: mov z27.d, z4.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z30.d, z7.d -; CHECK-NEXT: mov z27.d, z4.d -; CHECK-NEXT: mov z29.d, z6.d -; CHECK-NEXT: mov z26.d, z3.d -; CHECK-NEXT: mov z28.d, z5.d -; CHECK-NEXT: mov z25.d, z2.d -; CHECK-NEXT: mov z24.d, z1.d +; CHECK-NEXT: mov z29.d, z6.d +; CHECK-NEXT: mov z26.d, z3.d +; CHECK-NEXT: mov z28.d, z5.d +; CHECK-NEXT: mov z25.d, z2.d ; CHECK-NEXT: ld1d { z31.d }, p0/z, [x0] +; CHECK-NEXT: mov z24.d, z1.d ; CHECK-NEXT: sqdmulh { z24.d - z27.d }, { z24.d - z27.d }, { z28.d - z31.d } ; CHECK-NEXT: mov z0.d, z24.d ; CHECK-NEXT: mov z1.d, z25.d ; CHECK-NEXT: mov z2.d, z26.d ; CHECK-NEXT: mov z3.d, z27.d ; CHECK-NEXT: ret + %zm1, %zm2, %zm3, %zm4) { %res = call { , , , } @llvm.aarch64.sve.sqdmulh.vgx4.nxv2i64( %zdn1, %zdn2, %zdn3, %zdn4, %zm1, %zm2, %zm3, %zm4) diff --git a/llvm/test/CodeGen/AArch64/split-vector-insert.ll b/llvm/test/CodeGen/AArch64/split-vector-insert.ll index a507296338f93..b8ab9a00c6981 100644 --- a/llvm/test/CodeGen/AArch64/split-vector-insert.ll +++ b/llvm/test/CodeGen/AArch64/split-vector-insert.ll @@ 
-1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc < %s -debug-only=legalize-types 2>&1 | FileCheck %s --check-prefix=CHECK-LEGALIZATION ; RUN: llc < %s | FileCheck %s ; REQUIRES: asserts @@ -9,54 +10,94 @@ declare @llvm.vector.insert.nxv2i64.v8i64(, declare @llvm.vector.insert.nxv2f64.v8f64(, <8 x double>, i64) define @test_nxv2i64_v8i64( %a, <8 x i64> %b) #0 { -; CHECK-LEGALIZATION: Legally typed node: [[T1:t[0-9]+]]: nxv2i64 = insert_subvector {{t[0-9]+}}, {{t[0-9]+}}, Constant:i64<0> -; CHECK-LEGALIZATION: Legally typed node: [[T2:t[0-9]+]]: nxv2i64 = insert_subvector [[T1]], {{t[0-9]+}}, Constant:i64<2> -; CHECK-LEGALIZATION: Legally typed node: [[T3:t[0-9]+]]: nxv2i64 = insert_subvector [[T2]], {{t[0-9]+}}, Constant:i64<4> -; CHECK-LEGALIZATION: Legally typed node: [[T4:t[0-9]+]]: nxv2i64 = insert_subvector [[T3]], {{t[0-9]+}}, Constant:i64<6> - +; CHECK-LEGALIZATION-LABEL: test_nxv2i64_v8i64: +; CHECK-LEGALIZATION: // %bb.0: +; CHECK-LEGALIZATION-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LEGALIZATION-NEXT: .cfi_offset w29, -16 +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #-3 +; CHECK-LEGALIZATION-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-LEGALIZATION-NEXT: cntd x8 +; CHECK-LEGALIZATION-NEXT: ptrue p0.d, vl2 +; CHECK-LEGALIZATION-NEXT: mov w9, #2 // =0x2 +; CHECK-LEGALIZATION-NEXT: sub x8, x8, #2 +; CHECK-LEGALIZATION-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-LEGALIZATION-NEXT: mov x10, sp +; CHECK-LEGALIZATION-NEXT: cmp x8, #2 +; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d +; CHECK-LEGALIZATION-NEXT: ptrue p0.d +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #4 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 +; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #6 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 +; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 0 +; CHECK-LEGALIZATION-NEXT: .cfi_restore w29 +; CHECK-LEGALIZATION-NEXT: ret +; ; CHECK-LABEL: test_nxv2i64_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 // =0x6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + @@ -66,54 +107,94 @@ define @test_nxv2i64_v8i64( %a, <8 x i64> % } define @test_nxv2f64_v8f64( %a, <8 x double> %b) #0 { -; CHECK-LEGALIZATION: Legally typed node: [[T1:t[0-9]+]]: nxv2f64 = insert_subvector {{t[0-9]+}}, {{t[0-9]+}}, Constant:i64<0> -; CHECK-LEGALIZATION: Legally typed node: [[T2:t[0-9]+]]: nxv2f64 = insert_subvector [[T1]], {{t[0-9]+}}, Constant:i64<2> -; CHECK-LEGALIZATION: Legally typed node: [[T3:t[0-9]+]]: nxv2f64 = insert_subvector [[T2]], 
{{t[0-9]+}}, Constant:i64<4> -; CHECK-LEGALIZATION: Legally typed node: [[T4:t[0-9]+]]: nxv2f64 = insert_subvector [[T3]], {{t[0-9]+}}, Constant:i64<6> - +; CHECK-LEGALIZATION-LABEL: test_nxv2f64_v8f64: +; CHECK-LEGALIZATION: // %bb.0: +; CHECK-LEGALIZATION-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LEGALIZATION-NEXT: .cfi_offset w29, -16 +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #-3 +; CHECK-LEGALIZATION-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-LEGALIZATION-NEXT: cntd x8 +; CHECK-LEGALIZATION-NEXT: ptrue p0.d, vl2 +; CHECK-LEGALIZATION-NEXT: mov w9, #2 // =0x2 +; CHECK-LEGALIZATION-NEXT: sub x8, x8, #2 +; CHECK-LEGALIZATION-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-LEGALIZATION-NEXT: mov x10, sp +; CHECK-LEGALIZATION-NEXT: cmp x8, #2 +; CHECK-LEGALIZATION-NEXT: mov z0.d, p0/m, z1.d +; CHECK-LEGALIZATION-NEXT: ptrue p0.d +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #4 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-LEGALIZATION-NEXT: str q2, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #4 // =0x4 +; CHECK-LEGALIZATION-NEXT: addvl x10, sp, #1 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-LEGALIZATION-NEXT: csel x9, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: cmp x8, #6 +; CHECK-LEGALIZATION-NEXT: lsl x9, x9, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: str q3, [x10, x9] +; CHECK-LEGALIZATION-NEXT: mov w9, #6 // =0x6 +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-LEGALIZATION-NEXT: csel x8, x8, x9, lo +; CHECK-LEGALIZATION-NEXT: addvl x9, sp, #2 +; CHECK-LEGALIZATION-NEXT: lsl x8, x8, #3 +; CHECK-LEGALIZATION-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: str q4, [x9, x8] +; CHECK-LEGALIZATION-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-LEGALIZATION-NEXT: addvl sp, sp, #3 +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-LEGALIZATION-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LEGALIZATION-NEXT: .cfi_def_cfa_offset 0 +; CHECK-LEGALIZATION-NEXT: .cfi_restore w29 +; CHECK-LEGALIZATION-NEXT: ret +; ; CHECK-LABEL: test_nxv2f64_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: addvl sp, sp, #-3 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG -; CHECK-NEXT: ptrue p1.d, vl2 -; CHECK-NEXT: cntd x8 -; CHECK-NEXT: mov w9, #2 // =0x2 -; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: sub x8, x8, #2 -; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: cmp x8, #2 -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #4 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: mov z0.d, p1/m, z1.d -; CHECK-NEXT: st1d { z0.d }, p0, [sp] -; CHECK-NEXT: str q2, [x10, x9] -; CHECK-NEXT: mov w9, #4 // =0x4 -; CHECK-NEXT: addvl x10, sp, #1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] -; CHECK-NEXT: csel x9, x8, x9, lo -; CHECK-NEXT: cmp x8, #6 -; CHECK-NEXT: lsl x9, x9, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] -; CHECK-NEXT: str q3, [x10, x9] -; CHECK-NEXT: mov w9, #6 // =0x6 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: addvl x9, sp, #2 -; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] -; CHECK-NEXT: str q4, [x9, x8] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] -; CHECK-NEXT: addvl sp, sp, #3 -; CHECK-NEXT: .cfi_def_cfa wsp, 16 -; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: .cfi_def_cfa_offset 0 -; CHECK-NEXT: .cfi_restore w29 -; CHECK-NEXT: ret +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 24 * VG +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: sub x8, x8, #2 +; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: mov z0.d, p0/m, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: str q2, [x10, x9] +; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: addvl x10, sp, #1 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] +; CHECK-NEXT: csel x9, x8, x9, lo +; CHECK-NEXT: cmp x8, #6 +; CHECK-NEXT: lsl x9, x9, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #1, mul vl] +; CHECK-NEXT: str q3, [x10, x9] +; CHECK-NEXT: mov w9, #6 // =0x6 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: addvl x9, sp, #2 +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: st1d { z0.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: str q4, [x9, x8] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #2, mul vl] +; CHECK-NEXT: addvl sp, sp, #3 +; CHECK-NEXT: .cfi_def_cfa wsp, 16 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret + diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll index 1d9cb88260b60..c0c0ae5c9d1fe 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-splat.ll @@ -111,6 +111,7 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_undef1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: 
movi v3.4s, #25 ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -118,9 +119,8 @@ define <4 x i32> @test_srem_odd_undef1(<4 x i32> %X) nounwind { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: sshr v2.4s, v1.4s, #3 ; CHECK-NEXT: usra v2.4s, v1.4s, #31 -; CHECK-NEXT: movi v1.4s, #25 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -134,6 +134,7 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_even_undef1: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #34079 // =0x851f +; CHECK-NEXT: movi v3.4s, #100 ; CHECK-NEXT: movk w8, #20971, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -141,9 +142,8 @@ define <4 x i32> @test_srem_even_undef1(<4 x i32> %X) nounwind { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: sshr v2.4s, v1.4s, #5 ; CHECK-NEXT: usra v2.4s, v1.4s, #31 -; CHECK-NEXT: movi v1.4s, #100 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s ; CHECK-NEXT: movi v1.4s, #1 +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret @@ -201,11 +201,11 @@ define <4 x i32> @test_srem_pow2(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_int_min(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_int_min: ; CHECK: // %bb.0: -; CHECK-NEXT: cmlt v1.4s, v0.4s, #0 -; CHECK-NEXT: mov v2.16b, v0.16b -; CHECK-NEXT: usra v2.4s, v1.4s, #1 +; CHECK-NEXT: cmlt v2.4s, v0.4s, #0 +; CHECK-NEXT: mov v3.16b, v0.16b ; CHECK-NEXT: movi v1.4s, #128, lsl #24 -; CHECK-NEXT: and v1.16b, v2.16b, v1.16b +; CHECK-NEXT: usra v3.4s, v2.4s, #1 +; CHECK-NEXT: and v1.16b, v3.16b, v1.16b ; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: movi v1.4s, #1 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll index 0598af7c98063..a74f0c86fe185 100644 --- a/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/srem-vector-lkk.ll @@ -245,6 +245,7 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) { ; CHECK-LABEL: fold_srem_v4i32: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #26215 // =0x6667 +; CHECK-NEXT: movi v3.4s, #10 ; CHECK-NEXT: movk w8, #26214, lsl #16 ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s @@ -252,8 +253,7 @@ define <4 x i32> @fold_srem_v4i32(<4 x i32> %x) { ; CHECK-NEXT: uzp2 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: sshr v2.4s, v1.4s, #2 ; CHECK-NEXT: usra v2.4s, v1.4s, #31 -; CHECK-NEXT: movi v1.4s, #10 -; CHECK-NEXT: mls v0.4s, v2.4s, v1.4s +; CHECK-NEXT: mls v0.4s, v2.4s, v3.4s ; CHECK-NEXT: ret %1 = srem <4 x i32> %x, ret <4 x i32> %1 diff --git a/llvm/test/CodeGen/AArch64/sve-abd.ll b/llvm/test/CodeGen/AArch64/sve-abd.ll index 95aec0a492619..7b492229e3d23 100644 --- a/llvm/test/CodeGen/AArch64/sve-abd.ll +++ b/llvm/test/CodeGen/AArch64/sve-abd.ll @@ -24,10 +24,10 @@ define @sabd_b( %a, %b) define @sabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: sabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: sabd z0.b, p2/m, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: sabd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %a.sext = sext %a to %b.sext = sext %b to @@ -144,10 +144,10 @@ define @uabd_b( %a, 
%b) define @uabd_b_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_b_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p1/z, #1 // =0x1 -; CHECK-NEXT: uabd z0.b, p2/m, z0.b, z1.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: uabd z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: ret %a.zext = zext %a to %b.zext = zext %b to @@ -173,9 +173,9 @@ define @uabd_h( %a, %b) define @uabd_h_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_h_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: and z1.h, z1.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: uabd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %a.zext = zext %a to @@ -202,9 +202,9 @@ define @uabd_s( %a, %b) define @uabd_s_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_s_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to @@ -231,9 +231,9 @@ define @uabd_d( %a, %b) define @uabd_d_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_d_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uabd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %a.zext = zext %a to @@ -248,8 +248,8 @@ define @uabd_d_promoted_ops( %a, @uabd_non_matching_extension( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_extension: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to @@ -265,9 +265,9 @@ define @uabd_non_matching_extension( %a, @uabd_non_matching_promoted_ops( %a, %b) #0 { ; CHECK-LABEL: uabd_non_matching_promoted_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: and z1.s, z1.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uabd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %a.zext = zext %a to diff --git a/llvm/test/CodeGen/AArch64/sve-bitcast.ll b/llvm/test/CodeGen/AArch64/sve-bitcast.ll index 7dd568fc837a3..95f43ba512632 100644 --- a/llvm/test/CodeGen/AArch64/sve-bitcast.ll +++ b/llvm/test/CodeGen/AArch64/sve-bitcast.ll @@ -1698,8 +1698,8 @@ define @bitcast_nxv8i8_to_nxv1i64( %v) #0 { ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -1720,8 +1720,8 @@ define @bitcast_nxv4i16_to_nxv1i64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -1742,8 +1742,8 @@ define @bitcast_nxv2i32_to_nxv1i64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2218,8 +2218,8 @@ define @bitcast_nxv8i8_to_nxv1f64( %v) #0 ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK_BE-NEXT: ptrue p0.b ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1b { z0.b }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2240,8 +2240,8 @@ define @bitcast_nxv4i16_to_nxv1f64( %v) ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK_BE-NEXT: ptrue p0.h ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1h { z0.h }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2262,8 +2262,8 @@ define @bitcast_nxv2i32_to_nxv1f64( %v) ; CHECK_BE: // %bb.0: ; CHECK_BE-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK_BE-NEXT: addvl sp, sp, #-1 -; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ptrue p1.d ; CHECK_BE-NEXT: st1w { z0.s }, p0, [sp] ; CHECK_BE-NEXT: ld1d { z0.d }, p1/z, [sp] @@ -2827,11 +2827,11 @@ define @bitcast_nxv2f16_to_nxv1i32( %v) #0 ; CHECK_BE-NEXT: addvl sp, sp, #-2 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: ptrue p2.s ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] ; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p2/z, [sp, #1, mul vl] +; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] ; CHECK_BE-NEXT: addvl sp, sp, #2 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret @@ -2860,11 +2860,11 @@ define @bitcast_nxv2bf16_to_nxv1i32( %v) ; CHECK_BE-NEXT: addvl sp, sp, #-2 ; CHECK_BE-NEXT: ptrue p0.d ; CHECK_BE-NEXT: ptrue p1.h -; CHECK_BE-NEXT: ptrue p2.s ; CHECK_BE-NEXT: st1h { z0.d }, p0, [sp] +; CHECK_BE-NEXT: ptrue p0.s ; CHECK_BE-NEXT: ld1h { z0.h }, p1/z, [sp] ; CHECK_BE-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] -; CHECK_BE-NEXT: ld1w { z0.s }, p2/z, [sp, #1, mul vl] +; CHECK_BE-NEXT: ld1w { z0.s }, p0/z, [sp, #1, mul vl] ; CHECK_BE-NEXT: addvl sp, sp, #2 ; CHECK_BE-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload ; CHECK_BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll index 56b023086ea24..3b7b03e6ef61f 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll @@ -64,12 +64,12 @@ define float @foo2(ptr %x0, ptr %x1) nounwind { ; CHECK-NEXT: add x9, sp, #16 ; CHECK-NEXT: mov w2, #2 // =0x2 ; CHECK-NEXT: mov w3, #3 // =0x3 +; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] +; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: mov w4, #4 // =0x4 ; CHECK-NEXT: mov w5, #5 // =0x5 ; CHECK-NEXT: mov w6, #6 // =0x6 ; CHECK-NEXT: mov w7, #7 // =0x7 -; CHECK-NEXT: ld4d { z1.d - z4.d }, p0/z, [x0] -; CHECK-NEXT: mov w0, wzr ; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w1, #1 // =0x1 @@ -182,8 +182,8 @@ entry: define double @foo5(i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, ptr 
%ptr1, ptr %ptr2, double %x0, %x1, %x2) nounwind { ; CHECK-LABEL: foo5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x8, #1, mul vl] ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x8] ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x8, #3, mul vl] @@ -229,10 +229,10 @@ entry: define void @aavpcs1(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x7] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x9] ; CHECK-NEXT: st1w { z1.s }, p0, [x9] ; CHECK-NEXT: st1w { z2.s }, p0, [x9] @@ -261,12 +261,12 @@ entry: define void @aavpcs2(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16,ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp x8, x9, [sp] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x6] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x5] ; CHECK-NEXT: ld1w { z5.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x4] @@ -299,11 +299,10 @@ entry: define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, %p0, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -311,15 +310,16 @@ define void @aavpcs3(float %s0, float %s1, float %s2, float %s3, float %s4, floa ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -339,8 +339,8 @@ entry: define void @aavpcs4(i32 %s0, i32 %s1, i32 %s2, i32 %s3, i32 %s4, i32 %s5, i32 %s6, i32 %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs4: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] +; 
CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x9, [sp, #16] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x8] ; CHECK-NEXT: st1w { z0.s }, p0, [x9] @@ -371,11 +371,10 @@ entry: define @aavpcs5(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aavpcs5: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -383,15 +382,16 @@ define @aavpcs5(float %s0, float %s1, float %s2, float %s3, ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z24.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z24.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z24.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -409,11 +409,10 @@ entry: define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float %s5, float %s6, float %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15, %s16, %s17, ptr %ptr) nounwind { ; CHECK-LABEL: aapcs1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldr x8, [sp] -; CHECK-NEXT: ldr x9, [sp, #16] -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x8] +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z2.s }, p0/z, [x7] ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x1] ; CHECK-NEXT: ld1w { z4.s }, p0/z, [x6] @@ -421,15 +420,16 @@ define void @aapcs1(float %s0, float %s1, float %s2, float %s3, float %s4, float ; CHECK-NEXT: ld1w { z6.s }, p0/z, [x2] ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x4] ; CHECK-NEXT: ld1w { z16.s }, p0/z, [x3] -; CHECK-NEXT: st1w { z0.s }, p0, [x9] -; CHECK-NEXT: st1w { z3.s }, p0, [x9] -; CHECK-NEXT: st1w { z6.s }, p0, [x9] -; CHECK-NEXT: st1w { z16.s }, p0, [x9] -; CHECK-NEXT: st1w { z7.s }, p0, [x9] -; CHECK-NEXT: st1w { z5.s }, p0, [x9] -; CHECK-NEXT: st1w { z4.s }, p0, [x9] -; CHECK-NEXT: st1w { z2.s }, p0, [x9] -; CHECK-NEXT: st1w { z1.s }, p0, [x9] +; CHECK-NEXT: ldr x8, [sp, #16] +; CHECK-NEXT: st1w { z1.s }, p0, [x8] +; CHECK-NEXT: st1w { z3.s }, p0, [x8] +; CHECK-NEXT: st1w { z6.s }, p0, [x8] +; CHECK-NEXT: st1w { z16.s }, p0, [x8] +; CHECK-NEXT: st1w { z7.s }, p0, [x8] +; CHECK-NEXT: st1w { z5.s }, p0, [x8] +; CHECK-NEXT: st1w { z4.s }, p0, [x8] +; CHECK-NEXT: st1w { z2.s }, p0, [x8] +; CHECK-NEXT: st1w { z0.s }, p0, [x8] ; CHECK-NEXT: ret entry: store volatile %s8, ptr %ptr @@ -486,13 +486,13 @@ define void @non_sve_caller_high_range_non_sve_callee_high_range(float %f0, floa ; CHECK-NEXT: fmov s2, 
#2.00000000 ; CHECK-NEXT: fmov s3, #3.00000000 ; CHECK-NEXT: fmov s4, #4.00000000 -; CHECK-NEXT: fmov s5, #5.00000000 -; CHECK-NEXT: fmov s6, #6.00000000 -; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: ld1w { z16.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z17.s }, p0/z, [x1] ; CHECK-NEXT: addvl x0, sp, #1 +; CHECK-NEXT: fmov s5, #5.00000000 +; CHECK-NEXT: fmov s6, #6.00000000 ; CHECK-NEXT: mov x1, sp +; CHECK-NEXT: fmov s7, #7.00000000 ; CHECK-NEXT: st1w { z17.s }, p0, [sp] ; CHECK-NEXT: st1w { z16.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: bl non_sve_callee_high_range @@ -548,20 +548,20 @@ define @sve_caller_non_sve_callee_high_range( %a, %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %vcond = fcmp oeq %a, %b diff --git a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll index f5721cd0fd793..7bc31d44bb654 100644 --- a/llvm/test/CodeGen/AArch64/sve-doublereduct.ll +++ b/llvm/test/CodeGen/AArch64/sve-doublereduct.ll @@ -87,8 +87,8 @@ define float @fmaximum_f32( %a, %b) { define i32 @add_i32( %a, %b) { ; CHECK-LABEL: add_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: add z0.s, z0.s, z2.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov x0, d0 @@ -160,8 +160,8 @@ define i16 @add_ext_v32i16( %a, %b) { define i32 @and_i32( %a, %b) { ; CHECK-LABEL: and_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z2.d ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -175,8 +175,8 @@ define i32 @and_i32( %a, %b) { define i32 @or_i32( %a, %b) { ; CHECK-LABEL: or_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: orr z0.d, z0.d, z2.d ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -190,8 +190,8 @@ define i32 @or_i32( %a, %b) { define i32 @xor_i32( %a, %b) { ; CHECK-LABEL: xor_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eor3 z0.d, z0.d, z1.d, z2.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-expand-div.ll b/llvm/test/CodeGen/AArch64/sve-expand-div.ll index fe5cdc9387728..180c64e0a7de1 100644 --- a/llvm/test/CodeGen/AArch64/sve-expand-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-expand-div.ll @@ -10,8 +10,8 @@ define @sdiv_i8( %a) #0 { ; CHECK-LABEL: sdiv_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, #86 // =0x56 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z1.b, z0.b, #7 ; CHECK-NEXT: add z0.b, z0.b, z1.b @@ -23,8 +23,8 @@ define @sdiv_i8( %a) #0 { define @sdiv_i16( %a) #0 { ; CHECK-LABEL: sdiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #21846 // =0x5556 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z1.h, z0.h, #15 @@ -37,8 +37,8 @@ define @sdiv_i16( %a) #0 { define @sdiv_i32( %a) #0 { ; CHECK-LABEL: sdiv_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #21846 // =0x5556 +; CHECK-NEXT: 
ptrue p0.s ; CHECK-NEXT: movk w8, #21845, lsl #16 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s @@ -52,8 +52,8 @@ define @sdiv_i32( %a) #0 { define @sdiv_i64( %a) #0 { ; CHECK-LABEL: sdiv_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #21846 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z1.d @@ -71,8 +71,8 @@ define @sdiv_i64( %a) #0 { define @udiv_i8( %a) #0 { ; CHECK-LABEL: udiv_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z1.b, #-85 // =0xffffffffffffffab +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: lsr z0.b, z0.b, #1 ; CHECK-NEXT: ret @@ -83,8 +83,8 @@ define @udiv_i8( %a) #0 { define @udiv_i16( %a) #0 { ; CHECK-LABEL: udiv_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #-21845 // =0xffffaaab +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z0.h, z0.h, #1 @@ -96,8 +96,8 @@ define @udiv_i16( %a) #0 { define @udiv_i32( %a) #0 { ; CHECK-LABEL: udiv_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #43691 // =0xaaab +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s @@ -110,8 +110,8 @@ define @udiv_i32( %a) #0 { define @udiv_i64( %a) #0 { ; CHECK-LABEL: udiv_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-6148914691236517206 // =0xaaaaaaaaaaaaaaaa +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movk x8, #43691 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-extract-element.ll b/llvm/test/CodeGen/AArch64/sve-extract-element.ll index a3c34b53baa07..6d4f5963881e5 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-element.ll @@ -616,8 +616,8 @@ define i1 @test_last_8xi1( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: mov x8, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: whilels p1.h, xzr, x8 -; CHECK-NEXT: lastb w8, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.h ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %vscale = call i64 @llvm.vscale.i64() @@ -630,10 +630,10 @@ define i1 @test_last_8xi1( %a) #0 { define i1 @test_lanex_4xi1( %a, i32 %x) #0 { ; CHECK-LABEL: test_lanex_4xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: whilels p1.s, xzr, x8 -; CHECK-NEXT: lastb w8, p1, z0.s +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb w8, p0, z0.s ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret %b = extractelement %a, i32 %x diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll index bc1c563810f35..b9c531fe33526 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-from-scalable-vector.ll @@ -100,16 +100,16 @@ define <2 x i64> @extract_v2i64_nxv8i64_8( %arg) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 
+; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] -; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] @@ -183,10 +183,10 @@ define <4 x i1> @extract_v4i1_nxv32i1_16( %arg) { ; CHECK-NEXT: addvl sp, sp, #-8 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 ; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: add x8, x8, #16 ; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl] ; CHECK-NEXT: st1b { z1.b }, p2, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll index e2f8dad03ef6f..88268104889fd 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-fixed-vector.ll @@ -17,15 +17,15 @@ define <2 x i64> @extract_v2i64_nxv2i64_idx2( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -50,15 +50,15 @@ define <4 x i32> @extract_v4i32_nxv4i32_idx4( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -115,15 +115,15 @@ define <8 x i16> @extract_v8i16_nxv8i16_idx8( %vec) nounwind { ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -214,14 +214,14 @@ define <16 x i8> @extract_v16i8_nxv16i8_idx16( %vec) nounwind ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: sub x8, x8, #16 ; CHECK-NEXT: cmp x8, #16 +; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: ldr q0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll index e60a2f142922f..3c0bd501f45d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll @@ -65,27 +65,25 @@ define @extract_nxv14i1_nxv28i1_14( %in) uw ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: punpkhi p2.h, p1.b -; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: punpklo p2.h, p2.b ; CHECK-NEXT: punpkhi p3.h, p1.b -; CHECK-NEXT: punpklo p1.h, p1.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpkhi p4.h, p2.b ; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpkhi p5.h, p3.b -; CHECK-NEXT: punpklo p3.h, p3.b -; CHECK-NEXT: punpkhi p6.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uzp1 p2.s, p5.s, p2.s +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p5.h, p1.b +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p3.s, p5.s, p3.s ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p3.s, p6.s, p3.s -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p4.s, p4.s, p0.s ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s ; CHECK-NEXT: uzp1 p1.h, p2.h, p4.h ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fcmp.ll b/llvm/test/CodeGen/AArch64/sve-fcmp.ll index f7e3b6d0171ac..35cbe65c6a8b8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcmp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcmp.ll @@ -374,8 +374,8 @@ define @one_zero( %x) { define @ueq_zero( %x) { ; CHECK-LABEL: ueq_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b diff --git a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll index f15807597ac21..78843e392e536 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcopysign.ll @@ -63,10 +63,10 @@ define @test_copysign_v4f32_v4f64( %a, ; CHECK-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-EXTEND-ROUND-NEXT: uunpkhi z3.d, z0.s ; CHECK-EXTEND-ROUND-NEXT: uunpklo z0.d, z0.s -; CHECK-EXTEND-ROUND-NEXT: and z3.s, z3.s, #0x7fffffff -; CHECK-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-EXTEND-ROUND-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK-EXTEND-ROUND-NEXT: fcvt z1.s, p0/m, z1.d +; CHECK-EXTEND-ROUND-NEXT: and 
z3.s, z3.s, #0x7fffffff +; CHECK-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff ; CHECK-EXTEND-ROUND-NEXT: and z2.s, z2.s, #0x80000000 ; CHECK-EXTEND-ROUND-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-EXTEND-ROUND-NEXT: orr z2.d, z3.d, z2.d @@ -115,9 +115,9 @@ declare @llvm.copysign.v2f64( %a, @test_copysign_v4f64_v4f32( %a, %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z3.d, z2.s ; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; CHECK-NEXT: and z1.d, z1.d, #0x7fffffffffffffff ; CHECK-NEXT: fcvt z3.d, p0/m, z3.s @@ -193,10 +193,10 @@ define @test_copysign_v4f16_v4f64( %a, @test_copysign_v8f16_v8f32( %a, @test_copysign_v8f16_v8f32( %a, @test_copysign_nxv4f32_nxv4f16( %a, %b) #0 { ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv4f32_nxv4f16: ; CHECK-NO-EXTEND-ROUND: // %bb.0: -; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.s ; CHECK-NO-EXTEND-ROUND-NEXT: and z1.s, z1.s, #0x80000000 ; CHECK-NO-EXTEND-ROUND-NEXT: and z0.s, z0.s, #0x7fffffff +; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.s ; CHECK-NO-EXTEND-ROUND-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NO-EXTEND-ROUND-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NO-EXTEND-ROUND-NEXT: ret @@ -285,9 +285,9 @@ define @test_copysign_nxv4f32_nxv4f16( % define @test_copysign_nxv2f64_nxv2f32( %a, %b) #0 { ; CHECK-NO-EXTEND-ROUND-LABEL: test_copysign_nxv2f64_nxv2f32: ; CHECK-NO-EXTEND-ROUND: // %bb.0: -; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-NO-EXTEND-ROUND-NEXT: and z1.d, z1.d, #0x8000000000000000 ; CHECK-NO-EXTEND-ROUND-NEXT: and z0.d, z0.d, #0x7fffffffffffffff +; CHECK-NO-EXTEND-ROUND-NEXT: ptrue p0.d ; CHECK-NO-EXTEND-ROUND-NEXT: orr z0.d, z0.d, z1.d ; CHECK-NO-EXTEND-ROUND-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NO-EXTEND-ROUND-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll index 0fe38bf9ae718..fc5128fffad36 100644 --- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll @@ -454,9 +454,9 @@ define @fcvtzu_d_nxv2f64( %a) { define @scvtf_h_nxv2i1( %a) { ; CHECK-LABEL: scvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.h, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -495,9 +495,9 @@ define @scvtf_h_nxv2i64( %a) { define @scvtf_h_nxv3i1( %a) { ; CHECK-LABEL: scvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -516,9 +516,9 @@ define @scvtf_h_nxv3i16( %a) { define @scvtf_h_nxv4i1( %a) { ; CHECK-LABEL: scvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -547,9 +547,9 @@ define @scvtf_h_nxv4i32( %a) { define @scvtf_h_nxv7i1( %a) { ; CHECK-LABEL: scvtf_h_nxv7i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = sitofp %a to ret %res @@ -568,9 +568,9 @@ define @scvtf_h_nxv7i16( %a) { define 
<vscale x 8 x half> @scvtf_h_nxv8i1(<vscale x 8 x i1> %a) { ; CHECK-LABEL: scvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = sitofp <vscale x 8 x i1> %a to <vscale x 8 x half> ret <vscale x 8 x half> %res @@ -589,9 +589,9 @@ define <vscale x 8 x half> @scvtf_h_nxv8i16(<vscale x 8 x i16> %a) { define <vscale x 2 x float> @scvtf_s_nxv2i1(<vscale x 2 x i1> %a) { ; CHECK-LABEL: scvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x float> ret <vscale x 2 x float> %res @@ -620,9 +620,9 @@ define <vscale x 2 x float> @scvtf_s_nxv2i64(<vscale x 2 x i64> %a) { define <vscale x 3 x float> @scvtf_s_nxv3i1(<vscale x 3 x i1> %a) { ; CHECK-LABEL: scvtf_s_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp <vscale x 3 x i1> %a to <vscale x 3 x float> ret <vscale x 3 x float> %res @@ -641,9 +641,9 @@ define <vscale x 3 x float> @scvtf_s_nxv3i32(<vscale x 3 x i32> %a) { define <vscale x 4 x float> @scvtf_s_nxv4i1(<vscale x 4 x i1> %a) { ; CHECK-LABEL: scvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x float> ret <vscale x 4 x float> %res @@ -662,9 +662,9 @@ define <vscale x 4 x float> @scvtf_s_nxv4i32(<vscale x 4 x i32> %a) { define <vscale x 2 x double> @scvtf_d_nxv2i1(<vscale x 2 x i1> %a) { ; CHECK-LABEL: scvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: scvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x double> ret <vscale x 2 x double> %res @@ -695,9 +695,9 @@ define <vscale x 2 x double> @scvtf_d_nxv2i64(<vscale x 2 x i64> %a) { define <vscale x 2 x half> @ucvtf_h_nxv2i1(<vscale x 2 x i1> %a) { ; CHECK-LABEL: ucvtf_h_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x half> ret <vscale x 2 x half> %res @@ -736,9 +736,9 @@ define <vscale x 2 x half> @ucvtf_h_nxv2i64(<vscale x 2 x i64> %a) { define <vscale x 3 x half> @ucvtf_h_nxv3i1(<vscale x 3 x i1> %a) { ; CHECK-LABEL: ucvtf_h_nxv3i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp <vscale x 3 x i1> %a to <vscale x 3 x half> ret <vscale x 3 x half> %res @@ -767,9 +767,9 @@ define <vscale x 3 x half> @ucvtf_h_nxv3i32(<vscale x 3 x i32> %a) { define <vscale x 4 x half> @ucvtf_h_nxv4i1(<vscale x 4 x i1> %a) { ; CHECK-LABEL: ucvtf_h_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x half> ret <vscale x 4 x half> %res @@ -798,9 +798,9 @@ define <vscale x 4 x half> @ucvtf_h_nxv4i32(<vscale x 4 x i32> %a) { define <vscale x 8 x half> @ucvtf_h_nxv8i1(<vscale x 8 x i1> %a) { ; CHECK-LABEL: ucvtf_h_nxv8i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.h, p1/m, z0.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ret %res = uitofp <vscale x 8 x i1> %a to <vscale x 8 x half> ret <vscale x 8 x half> %res @@ -819,9 +819,9 @@ define <vscale x 8 x half> @ucvtf_h_nxv8i16(<vscale x 8 x i16> %a) { define <vscale x 2 x float> @ucvtf_s_nxv2i1(<vscale x 2 x i1> %a) { ; CHECK-LABEL: ucvtf_s_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x float> ret <vscale x 2 x float> %res @@ -850,9 +850,9 @@ define <vscale x 2 x float> @ucvtf_s_nxv2i64(<vscale x 2 x i64> %a) { define <vscale x 4 x float> @ucvtf_s_nxv4i1(<vscale x 4 x i1> %a) { ; CHECK-LABEL: ucvtf_s_nxv4i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.s, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ret %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x float> ret <vscale x 4 x float> %res @@ -871,9 +871,9 @@ define <vscale x 4 x float> @ucvtf_s_nxv4i32(<vscale x 4 x i32> %a) { define <vscale x 2 x double> @ucvtf_d_nxv2i1(<vscale x 2 x i1> %a) { ; CHECK-LABEL: ucvtf_d_nxv2i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 -; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x double> ret <vscale x 2 x double> %res diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll index ed7ea657874a4..28e1412c524a0 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-addressing-modes.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: index z0.s, #0, #7 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1, z0.s, sxtw #2] ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -21,8 +21,8 @@ define void @masked_gather_base_plus_stride_v8f32(ptr %dst, ptr %src) #0 { define void @masked_gather_base_plus_stride_v4f64(ptr %dst, ptr %src) #0 { ; CHECK-LABEL: masked_gather_base_plus_stride_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov x8, #-32 // =0xffffffffffffffe0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: index z0.d, #-2, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1, z0.d, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll index ad482118ec0bb..47fda39d84001 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-build-vector.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @build_vector_7_inc1_v32i8(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_7_inc1_v32i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: index z0.b, #7, #1 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <32 x i8> <i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38>, ptr %a, align 1 @@ -18,8 +18,8 @@ define void @build_vector_7_inc1_v32i8(ptr %a) #0 { define void @build_vector_0_inc2_v16i16(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_0_inc2_v16i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: index z0.h, #0, #2 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <16 x i16> <i16 0, i16 2, i16 4, i16 6, i16 8, i16 10, i16 12, i16 14, i16 16, i16 18, i16 20, i16 22, i16 24, i16 26, i16 28, i16 30>, ptr %a, align 2 @@ -30,8 +30,8 @@ define void @build_vector_0_inc2_v16i16(ptr %a) #0 { define void @build_vector_0_dec3_v8i32(ptr %a) #0 { ; VBITS_GE_256-LABEL: build_vector_0_dec3_v8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: index z0.s, #0, #-3 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret store <8 x i32> <i32 0, i32 -3, i32 -6, i32 -9, i32 -12, i32 -15, i32 -18, i32 -21>, ptr %a, align 4 @@ -42,8 +42,8 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; VBITS_GE_256-LABEL:
build_vector_minus2_dec32_v4i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #-32 // =0xffffffffffffffe0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: index z0.d, #-2, x8 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -53,11 +53,6 @@ define void @build_vector_minus2_dec32_v4i64(ptr %a) #0 { ; Constant but not a sequence. define void @build_vector_no_stride_v4i64(ptr %a) #0 { -; VBITS_GE_256-LABEL: .LCPI4_0: -; VBITS_GE_256: .xword 0 -; VBITS_GE_256-NEXT: .xword 4 -; VBITS_GE_256-NEXT: .xword 1 -; VBITS_GE_256-NEXT: .xword 8 ; VBITS_GE_256-LABEL: build_vector_no_stride_v4i64: ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll index 65cb448cac117..f7751131005e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-concat.ll @@ -38,9 +38,9 @@ define void @concat_v32i8(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.b, vl32 ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl32 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i8>, ptr %a %op2 = load <16 x i8>, ptr %b @@ -66,11 +66,11 @@ define void @concat_v64i8(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v64i8: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.b, vl32 -; VBITS_GE_512-NEXT: ptrue p1.b, vl64 ; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.b, p0, z0.b, z1.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x i8>, ptr %a %op2 = load <32 x i8>, ptr %b @@ -90,11 +90,11 @@ define void @concat_v128i8(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl64 -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl128 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x i8>, ptr %a %op2 = load <64 x i8>, ptr %b @@ -122,11 +122,11 @@ define void @concat_v256i8(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v256i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.b, vl256 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2] +; CHECK-NEXT: ptrue p0.b, vl256 +; CHECK-NEXT: st1b { z0.b }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -198,9 +198,9 @@ define void @concat_v16i16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i16>, ptr %a %op2 = load <8 x i16>, ptr %b @@ -224,11 +224,11 @@ define void 
@concat_v32i16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v32i16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x i16>, ptr %a %op2 = load <16 x i16>, ptr %b @@ -244,11 +244,11 @@ define void @concat_v64i16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v64i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl64 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i16>, ptr %a %op2 = load <32 x i16>, ptr %b @@ -268,11 +268,11 @@ define void @concat_v128i16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v128i16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x i16>, ptr %a %op2 = load <64 x i16>, ptr %b @@ -328,9 +328,9 @@ define void @concat_v8i32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.s, vl8 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x i32>, ptr %a %op2 = load <4 x i32>, ptr %b @@ -353,11 +353,11 @@ define void @concat_v16i32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v16i32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x i32>, ptr %a %op2 = load <8 x i32>, ptr %b @@ -371,11 +371,11 @@ define void @concat_v32i32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v32i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i32>, ptr %a %op2 = load <16 x i32>, ptr %b @@ -391,11 +391,11 @@ define void @concat_v64i32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v64i32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: st1w { z0.s }, 
p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x i32>, ptr %a %op2 = load <32 x i32>, ptr %b @@ -433,9 +433,9 @@ define void @concat_v4i64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.d, vl4 ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <2 x i64>, ptr %a %op2 = load <2 x i64>, ptr %b @@ -458,11 +458,11 @@ define void @concat_v8i64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v8i64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <4 x i64>, ptr %a %op2 = load <4 x i64>, ptr %b @@ -475,11 +475,11 @@ define void @concat_v16i64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v16i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x i64>, ptr %a %op2 = load <8 x i64>, ptr %b @@ -493,11 +493,11 @@ define void @concat_v32i64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v32i64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x i64>, ptr %a %op2 = load <16 x i64>, ptr %b @@ -541,9 +541,9 @@ define void @concat_v16f16(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl16 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x half>, ptr %a %op2 = load <8 x half>, ptr %b @@ -567,11 +567,11 @@ define void @concat_v32f16(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v32f16: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.h, vl16 -; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.h, p0, z0.h, z1.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x half>, ptr %a %op2 = load <16 x half>, ptr %b @@ -587,11 +587,11 @@ define void @concat_v64f16(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v64f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl32 -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, 
vl64 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x half>, ptr %a %op2 = load <32 x half>, ptr %b @@ -611,11 +611,11 @@ define void @concat_v128f16(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v128f16: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1] ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1h { z0.h }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %op2 = load <64 x half>, ptr %b @@ -671,9 +671,9 @@ define void @concat_v8f32(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.s, vl8 ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl8 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <4 x float>, ptr %a %op2 = load <4 x float>, ptr %b @@ -696,11 +696,11 @@ define void @concat_v16f32(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v16f32: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.s, vl8 -; VBITS_GE_512-NEXT: ptrue p1.s, vl16 ; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.s, p0, z0.s, z1.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x float>, ptr %a %op2 = load <8 x float>, ptr %b @@ -714,11 +714,11 @@ define void @concat_v32f32(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v32f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl16 -; CHECK-NEXT: ptrue p1.s, vl32 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl32 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x float>, ptr %a %op2 = load <16 x float>, ptr %b @@ -734,11 +734,11 @@ define void @concat_v64f32(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v64f32: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 -; CHECK-NEXT: ptrue p1.s, vl64 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1] ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2] +; CHECK-NEXT: ptrue p0.s, vl64 +; CHECK-NEXT: st1w { z0.s }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %op2 = load <32 x float>, ptr %b @@ -776,9 +776,9 @@ define void @concat_v4f64(ptr %a, ptr %b, ptr %c) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] -; CHECK-NEXT: ptrue p1.d, vl4 ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <2 x double>, ptr %a %op2 = load <2 x double>, ptr %b @@ -801,11 +801,11 @@ define void @concat_v8f64(ptr %a, ptr %b, ptr %c) #0 { ; VBITS_GE_512-LABEL: concat_v8f64: ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl4 -; VBITS_GE_512-NEXT: ptrue p1.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: splice z0.d, p0, z0.d, z1.d -; VBITS_GE_512-NEXT: st1d { 
z0.d }, p1, [x2] +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x2] ; VBITS_GE_512-NEXT: ret %op1 = load <4 x double>, ptr %a %op2 = load <4 x double>, ptr %b @@ -818,11 +818,11 @@ define void @concat_v16f64(ptr %a, ptr %b, ptr %c) vscale_range(8,0) #0 { ; CHECK-LABEL: concat_v16f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl8 -; CHECK-NEXT: ptrue p1.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl16 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <8 x double>, ptr %a %op2 = load <8 x double>, ptr %b @@ -836,11 +836,11 @@ define void @concat_v32f64(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { ; CHECK-LABEL: concat_v32f64: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 -; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2] +; CHECK-NEXT: ptrue p0.d, vl32 +; CHECK-NEXT: st1d { z0.d }, p0, [x2] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %op2 = load <16 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll index 485124c1d59ed..ad4efeaf39247 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-extract-vector-elt.ll @@ -70,9 +70,9 @@ define half @extractelement_v64f16(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f -; CHECK-NEXT: whilels p1.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: lastb h0, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %r = extractelement <64 x half> %op1, i64 63 @@ -84,9 +84,9 @@ define half @extractelement_v128f16(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov w8, #127 // =0x7f -; CHECK-NEXT: whilels p1.h, xzr, x8 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] -; CHECK-NEXT: lastb h0, p1, z0.h +; CHECK-NEXT: whilels p0.h, xzr, x8 +; CHECK-NEXT: lastb h0, p0, z0.h ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %r = extractelement <128 x half> %op1, i64 127 @@ -154,9 +154,9 @@ define float @extractelement_v32f32(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: lastb s0, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %r = extractelement <32 x float> %op1, i64 31 @@ -168,9 +168,9 @@ define float @extractelement_v64f32(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f -; CHECK-NEXT: whilels p1.s, xzr, x8 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] -; CHECK-NEXT: lastb s0, p1, z0.s +; CHECK-NEXT: whilels p0.s, xzr, x8 +; CHECK-NEXT: lastb s0, p0, z0.s ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %r = extractelement <64 x float> %op1, i64 63 @@ -236,9 +236,9 @@ define double @extractelement_v16f64(ptr %a) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf -; 
CHECK-NEXT: whilels p1.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: lastb d0, p1, z0.d +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %r = extractelement <16 x double> %op1, i64 15 @@ -250,9 +250,9 @@ define double @extractelement_v32f64(ptr %a) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f -; CHECK-NEXT: whilels p1.d, xzr, x8 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: lastb d0, p1, z0.d +; CHECK-NEXT: whilels p0.d, xzr, x8 +; CHECK-NEXT: lastb d0, p0, z0.d ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %r = extractelement <32 x double> %op1, i64 31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll index bca3dfe5717ef..e77cd9ef55eaf 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fcopysign.ll @@ -396,9 +396,9 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mvni v1.4s, #128, lsl #24 ; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.s, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-NEXT: str q0, [x0] @@ -450,12 +450,12 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 ; CHECK_EXTEND_ROUND-NEXT: ldr q0, [x1] +; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d, vl4 ; CHECK_EXTEND_ROUND-NEXT: uunpklo z0.d, z0.s -; CHECK_EXTEND_ROUND-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK_EXTEND_ROUND-NEXT: ld1d { z1.d }, p0/z, [x0] ; CHECK_EXTEND_ROUND-NEXT: and z1.d, z1.d, #0x7fffffffffffffff +; CHECK_EXTEND_ROUND-NEXT: fcvt z0.d, p0/m, z0.s ; CHECK_EXTEND_ROUND-NEXT: and z0.d, z0.d, #0x8000000000000000 ; CHECK_EXTEND_ROUND-NEXT: orr z0.d, z1.d, z0.d ; CHECK_EXTEND_ROUND-NEXT: st1d { z0.d }, p0, [x0] @@ -494,9 +494,9 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mvni v1.4h, #128, lsl #8 ; CHECK-NEXT: ldr d2, [x0] -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.h, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: bit v0.8b, v2.8b, v1.8b @@ -521,9 +521,9 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mvni v1.8h, #128, lsl #8 ; CHECK-NEXT: ldr q2, [x0] -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1] -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: bit v0.16b, v2.16b, v1.16b ; CHECK-NEXT: str q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll index 6da07b855a5c5..b60988be1e76c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-extend-trunc.ll @@ -462,11 +462,11 @@ define void @fcvt_v8f64_v8f16(ptr %a, ptr 
%b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.d ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcvt z0.h, p1/m, z0.d -; VBITS_GE_256-NEXT: fcvt z1.h, p1/m, z1.d +; VBITS_GE_256-NEXT: ptrue p0.d +; VBITS_GE_256-NEXT: fcvt z0.h, p0/m, z0.d +; VBITS_GE_256-NEXT: fcvt z1.h, p0/m, z1.d ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll index 13ebda1df7f9d..d1e9dc13f50e8 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-select.ll @@ -34,15 +34,15 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vsca define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x half>, ptr %a %op2 = load volatile <16 x half>, ptr %b @@ -54,33 +54,33 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w2 +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h -; VBITS_GE_256-NEXT: mov z0.h, p1/m, z2.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] -; 
VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x half>, ptr %a %op2 = load volatile <32 x half>, ptr %b @@ -92,15 +92,15 @@ define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x half>, ptr %a %op2 = load volatile <64 x half>, ptr %b @@ -112,15 +112,15 @@ define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v128f16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x half>, ptr %a %op2 = load volatile <128 x half>, ptr %b @@ -158,15 +158,15 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) v define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x float>, ptr %a %op2 = load volatile <8 x float>, ptr %b @@ -178,33 +178,33 @@ define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: 
ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z0.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s -; VBITS_GE_256-NEXT: mov z0.s, p1/m, z2.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x float>, ptr %a %op2 = load volatile <16 x float>, ptr %b @@ -216,15 +216,15 @@ define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x float>, ptr %a %op2 = load volatile <32 x float>, ptr %b @@ -236,15 +236,15 @@ define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64f32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: 
ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x float>, ptr %a %op2 = load volatile <64 x float>, ptr %b @@ -282,16 +282,16 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x double>, ptr %a %op2 = load volatile <4 x double>, ptr %b @@ -303,35 +303,35 @@ define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z0.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d -; VBITS_GE_256-NEXT: mov z0.d, p1/m, z2.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x double>, ptr %a %op2 = load volatile <8 x double>, ptr 
%b @@ -343,16 +343,16 @@ define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x double>, ptr %a %op2 = load volatile <16 x double>, ptr %b @@ -364,16 +364,16 @@ define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x double>, ptr %a %op2 = load volatile <32 x double>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll index da0cf927d74d2..af54b146c5b66 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-to-int.ll @@ -131,8 +131,8 @@ define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 { define void @fcvtzu_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzu_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -357,7 +357,6 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.s @@ -366,7 +365,8 @@ define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v16f32_v16i16: @@ -532,8 +532,8 @@ define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 { define void @fcvtzu_v4f32_v4i64(ptr %a, ptr %b) 
vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzu_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -752,7 +752,6 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzu z0.d, p0/m, z0.d @@ -761,7 +760,8 @@ define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzu_v8f64_v8i32: @@ -1024,8 +1024,8 @@ define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) vscale_range(2,0) #0 { define void @fcvtzs_v8f16_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzs_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -1250,7 +1250,6 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzs z0.s, p0/m, z0.s @@ -1259,7 +1258,8 @@ define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v16f32_v16i16: @@ -1425,8 +1425,8 @@ define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) vscale_range(2,0) #0 { define void @fcvtzs_v4f32_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: fcvtzs_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -1645,7 +1645,6 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: fcvtzs z0.d, p0/m, z0.d @@ -1654,7 +1653,8 @@ define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: fcvtzs_v8f64_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll index 115f722986b5c..61e04682fa0bf 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp128.ll @@ -85,10 +85,10 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: str q0, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: ldr q0, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: bl __trunctfdf2 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q1, [sp, #32] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #128 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8, #1, mul vl] // 16-byte Folded Reload ; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d @@ -111,14 +111,14 @@ define void @fcvt_v4f128_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ldr q0, [sp, #112] // 16-byte Folded Reload ; CHECK-NEXT: bl __trunctfdf2 ; CHECK-NEXT: ldr q1, [sp, #96] // 16-byte Folded Reload -; CHECK-NEXT: ptrue p1.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: add x8, sp, #128 -; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ldr z1, [x8] // 16-byte Folded Reload ; CHECK-NEXT: mov x8, #4 // =0x4 -; CHECK-NEXT: splice z0.d, p1, z0.d, z1.d +; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x19, x8, lsl #3] ; CHECK-NEXT: add x8, sp, #128 ; CHECK-NEXT: ldr z0, [x8, #1, mul vl] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll index a4b5ccd69fdb7..1bd688d23050b 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-frame-offests-crash.ll @@ -21,24 +21,24 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v ; CHECK-NEXT: .cfi_offset w22, -32 ; CHECK-NEXT: .cfi_offset w29, -48 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: add x11, sp, #176 -; CHECK-NEXT: add x10, sp, #144 -; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: add x10, sp, #176 ; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: add x9, sp, #144 ; CHECK-NEXT: add x20, sp, #176 -; CHECK-NEXT: ldp x13, x12, [sp, #328] ; CHECK-NEXT: ldr x15, [sp, #104] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x10] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] +; CHECK-NEXT: add x8, sp, #112 +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x9] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x8] ; CHECK-NEXT: ldur q4, [sp, #88] -; CHECK-NEXT: ldp x16, x17, [sp, #208] +; CHECK-NEXT: ldp x9, x8, [sp, #328] ; CHECK-NEXT: ldr x19, [sp, #272] +; CHECK-NEXT: ldp x11, x10, [sp, #312] +; CHECK-NEXT: ldp x13, x12, [sp, #296] +; CHECK-NEXT: ldp x18, x14, [sp, #280] +; CHECK-NEXT: ldp x16, x17, [sp, #208] ; CHECK-NEXT: ldp x21, x22, [sp, #352] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x11] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x10] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x9] -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8] -; CHECK-NEXT: ldp x8, x14, [sp, #312] -; CHECK-NEXT: ldp x10, x9, [sp, #296] -; CHECK-NEXT: ldp x18, x11, [sp, #280] ; CHECK-NEXT: st1d { z3.d }, p0, [x20] ; CHECK-NEXT: add x20, sp, #144 ; CHECK-NEXT: st1d { z2.d }, p0, [x20] @@ -53,10 +53,10 @@ define dso_local void @func1(ptr %v1, ptr %v2, ptr %v3, ptr %v4, ptr %v5, ptr %v ; CHECK-NEXT: stp x16, x17, [sp, #208] ; CHECK-NEXT: stur q4, [sp, #88] ; CHECK-NEXT: str x15, [sp, #104] -; 
CHECK-NEXT: stp x11, x10, [sp, #288] -; CHECK-NEXT: stp x9, x8, [sp, #304] -; CHECK-NEXT: stp x14, x13, [sp, #320] -; CHECK-NEXT: str x12, [sp, #336] +; CHECK-NEXT: stp x14, x13, [sp, #288] +; CHECK-NEXT: stp x12, x11, [sp, #304] +; CHECK-NEXT: stp x10, x9, [sp, #320] +; CHECK-NEXT: str x8, [sp, #336] ; CHECK-NEXT: ldr x29, [sp], #48 // 8-byte Folded Reload ; CHECK-NEXT: b func2 ptr %v9, ptr %v10, ptr %v11, ptr %v12, ptr %v13, ptr %v14, ptr %v15, ptr %v16, diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll index 977c528e2583a..6f4d257039bca 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll @@ -36,16 +36,16 @@ define <8 x half> @insertelement_v8f16(<8 x half> %op1) vscale_range(2,0) #0 { define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl16 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x half>, ptr %a %r = insertelement <16 x half> %op1, half 5.0, i64 15 @@ -56,33 +56,33 @@ define void @insertelement_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v32f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov w8, #15 // =0xf ; VBITS_GE_256-NEXT: index z0.h, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov z1.h, w8 +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: fmov h2, #5.00000000 -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.h, p1/m, h2 -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] -; VBITS_GE_256-NEXT: st1h { z3.h }, p0, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; VBITS_GE_256-NEXT: fmov h0, #5.00000000 +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: mov z1.h, p0/m, h0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov w8, #31 // =0x1f ; VBITS_GE_512-NEXT: index z0.h, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h ; VBITS_GE_512-NEXT: mov z1.h, w8 -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; VBITS_GE_512-NEXT: fmov h0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.h, p1/m, h0 -; VBITS_GE_512-NEXT: st1h { z2.h }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 +; VBITS_GE_512-NEXT: cmpeq p0.h, p0/z, 
z0.h, z1.h +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov h1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.h, p0/m, h1 +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <32 x half>, ptr %a %r = insertelement <32 x half> %op1, half 5.0, i64 31 @@ -93,16 +93,16 @@ define void @insertelement_v32f16(ptr %a, ptr %b) #0 { define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl64 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <64 x half>, ptr %a %r = insertelement <64 x half> %op1, half 5.0, i64 63 @@ -113,16 +113,16 @@ define void @insertelement_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov w8, #127 // =0x7f ; CHECK-NEXT: index z0.h, #0, #1 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.h, p1/z, z0.h, z1.h -; CHECK-NEXT: fmov h0, #5.00000000 -; CHECK-NEXT: mov z2.h, p1/m, h0 -; CHECK-NEXT: st1h { z2.h }, p0, [x1] +; CHECK-NEXT: ptrue p1.h, vl128 +; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: fmov h1, #5.00000000 +; CHECK-NEXT: mov z0.h, p0/m, h1 +; CHECK-NEXT: st1h { z0.h }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <128 x half>, ptr %a %r = insertelement <128 x half> %op1, half 5.0, i64 127 @@ -157,16 +157,16 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) vscale_range(2,0) #0 { define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <8 x float>, ptr %a %r = insertelement <8 x float> %op1, float 5.0, i64 7 @@ -177,33 +177,33 @@ define void @insertelement_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v16f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_256-NEXT: index z0.s, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z1.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; 
VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: fmov s2, #5.00000000 -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.s, p1/m, s2 -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z3.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_256-NEXT: fmov s0, #5.00000000 +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: mov z1.s, p0/m, s0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov w8, #15 // =0xf ; VBITS_GE_512-NEXT: index z0.s, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z1.s, w8 -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; VBITS_GE_512-NEXT: fmov s0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.s, p1/m, s0 -; VBITS_GE_512-NEXT: st1w { z2.s }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov s1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.s, p0/m, s1 +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <16 x float>, ptr %a %r = insertelement <16 x float> %op1, float 5.0, i64 15 @@ -214,16 +214,16 @@ define void @insertelement_v16f32(ptr %a, ptr %b) #0 { define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <32 x float>, ptr %a %r = insertelement <32 x float> %op1, float 5.0, i64 31 @@ -234,16 +234,16 @@ define void @insertelement_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov w8, #63 // =0x3f ; CHECK-NEXT: index z0.s, #0, #1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.s, p1/z, z0.s, z1.s -; CHECK-NEXT: fmov s0, #5.00000000 -; CHECK-NEXT: mov z2.s, p1/m, s0 -; CHECK-NEXT: st1w { z2.s }, p0, [x1] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: fmov s1, #5.00000000 +; CHECK-NEXT: mov z0.s, p0/m, s1 +; CHECK-NEXT: st1w { z0.s }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <64 x float>, ptr %a %r = insertelement <64 x float> %op1, float 5.0, i64 63 @@ -276,16 +276,16 @@ define <2 x double> 
@insertelement_v2f64(<2 x double> %op1) vscale_range(2,0) #0 define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <4 x double>, ptr %a %r = insertelement <4 x double> %op1, double 5.0, i64 3 @@ -296,33 +296,33 @@ define void @insertelement_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @insertelement_v8f64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: insertelement_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov w8, #3 // =0x3 ; VBITS_GE_256-NEXT: index z0.d, #0, #1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z1.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: fmov d2, #5.00000000 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: mov z3.d, p1/m, d2 -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] -; VBITS_GE_256-NEXT: st1d { z3.d }, p0, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_256-NEXT: fmov d0, #5.00000000 +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z1.d, p0/m, d0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: insertelement_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov w8, #7 // =0x7 ; VBITS_GE_512-NEXT: index z0.d, #0, #1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z1.d, x8 -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; VBITS_GE_512-NEXT: fmov d0, #5.00000000 -; VBITS_GE_512-NEXT: mov z2.d, p1/m, d0 -; VBITS_GE_512-NEXT: st1d { z2.d }, p0, [x1] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: fmov d1, #5.00000000 +; VBITS_GE_512-NEXT: mov z0.d, p0/m, d1 +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x1] ; VBITS_GE_512-NEXT: ret %op1 = load <8 x double>, ptr %a %r = insertelement <8 x double> %op1, double 5.0, i64 7 @@ -333,16 +333,16 @@ define void @insertelement_v8f64(ptr %a, ptr %b) #0 { define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: insertelement_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, 
p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <16 x double>, ptr %a %r = insertelement <16 x double> %op1, double 5.0, i64 15 @@ -353,16 +353,16 @@ define void @insertelement_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @insertelement_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: insertelement_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov w8, #31 // =0x1f ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0] -; CHECK-NEXT: cmpeq p1.d, p1/z, z0.d, z1.d -; CHECK-NEXT: fmov d0, #5.00000000 -; CHECK-NEXT: mov z2.d, p1/m, d0 -; CHECK-NEXT: st1d { z2.d }, p0, [x1] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: fmov d1, #5.00000000 +; CHECK-NEXT: mov z0.d, p0/m, d1 +; CHECK-NEXT: st1d { z0.d }, p1, [x1] ; CHECK-NEXT: ret %op1 = load <32 x double>, ptr %a %r = insertelement <32 x double> %op1, double 5.0, i64 31 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll index b45a62f4e7581..58fca3a2cf8b6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-arith.ll @@ -1388,11 +1388,11 @@ define void @abs_v128i16(ptr %a) vscale_range(2,0) #0 { ; CHECK-NEXT: mov x10, #64 // =0x40 ; CHECK-NEXT: mov x11, #80 // =0x50 ; CHECK-NEXT: mov x12, #32 // =0x20 -; CHECK-NEXT: mov x13, #48 // =0x30 -; CHECK-NEXT: mov x14, #16 // =0x10 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x10, lsl #1] +; CHECK-NEXT: mov x13, #48 // =0x30 +; CHECK-NEXT: mov x14, #16 // =0x10 ; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, x11, lsl #1] ; CHECK-NEXT: ld1h { z4.h }, p0/z, [x0, x12, lsl #1] ; CHECK-NEXT: ld1h { z5.h }, p0/z, [x0, x13, lsl #1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll index 11ed69513917c..0ddf434eff930 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-div.ll @@ -15,9 +15,9 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 @@ -94,11 +94,11 @@ define <8 x i8> @sdiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 ; VBITS_GE_128-NEXT: sshll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; 
VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -203,7 +203,6 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: sdiv_v128i8: ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.b, vl128 -; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] ; CHECK-NEXT: ld1b { z1.b }, p0/z, [x1] ; CHECK-NEXT: ptrue p0.s, vl64 @@ -221,7 +220,8 @@ define void @sdiv_v128i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h -; CHECK-NEXT: st1b { z1.h }, p1, [x0] +; CHECK-NEXT: ptrue p0.h, vl128 +; CHECK-NEXT: st1b { z1.h }, p0, [x0] ; CHECK-NEXT: ret %op1 = load <128 x i8>, ptr %a %op2 = load <128 x i8>, ptr %b @@ -260,14 +260,14 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -284,18 +284,18 @@ define void @sdiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: sdiv_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sdivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_256-NEXT: mov w8, v1.s[1] ; VBITS_GE_256-NEXT: mov v0.16b, v1.16b @@ -309,9 +309,9 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: sdiv_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sdivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_512-NEXT: mov w8, v1.s[1] ; VBITS_GE_512-NEXT: mov v0.16b, v1.16b @@ -329,11 +329,11 @@ define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: sshll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h @@ -341,9 +341,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: sdiv_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; 
VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -353,9 +353,9 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: sdiv_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ -369,8 +369,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @sdiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 @@ -542,8 +542,8 @@ define void @sdiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @sdiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdivr z0.s, p0/m, z0.s, z1.s @@ -664,8 +664,8 @@ define void @sdiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @sdiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: sdiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: sdivr z0.d, p0/m, z0.d, z1.d @@ -748,9 +748,9 @@ define void @sdiv_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 @@ -827,11 +827,11 @@ define <8 x i8> @udiv_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 ; VBITS_GE_128-NEXT: ushll v1.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v0.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -980,14 +980,14 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p1/m, z0.s, z1.s -; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p1, z1.h, z2.h ; CHECK-NEXT: 
uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z3.h, p1, z3.h, z0.h -; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b +; CHECK-NEXT: ptrue p1.b, vl128 ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: splice z0.b, p1, z0.b, z1.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0] @@ -1004,18 +1004,18 @@ define void @udiv_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: xtn v0.4h, v0.4s ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: udiv_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: udivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_256-NEXT: mov w8, v1.s[1] ; VBITS_GE_256-NEXT: mov v0.16b, v1.16b @@ -1029,9 +1029,9 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: udiv_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: udivr z1.s, p0/m, z1.s, z0.s ; VBITS_GE_512-NEXT: mov w8, v1.s[1] ; VBITS_GE_512-NEXT: mov v0.16b, v1.16b @@ -1049,11 +1049,11 @@ define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: udiv_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 ; VBITS_GE_128-NEXT: ushll v1.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v0.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; VBITS_GE_128-NEXT: uzp1 v0.8h, v0.8h, v2.8h @@ -1061,9 +1061,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: udiv_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1073,9 +1073,9 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: udiv_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -1089,8 +1089,8 @@ define <8 x i16> @udiv_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @udiv_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, 
#0 @@ -1253,8 +1253,8 @@ define void @udiv_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @udiv_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udivr z0.s, p0/m, z0.s, z1.s @@ -1375,8 +1375,8 @@ define void @udiv_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @udiv_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: udiv_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q5, q4, [x1, #32] ; VBITS_GE_128-NEXT: udivr z0.d, p0/m, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll index 756e5f4cddf80..4feb86305f8f6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-extends.ll @@ -58,8 +58,8 @@ define void @sext_v4i3_v4i64(<4 x i3> %a, ptr %out) vscale_range(2,0) #0 { define void @sext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -308,8 +308,8 @@ define void @sext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @sext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -472,8 +472,8 @@ define void @sext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @sext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: sext_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -554,8 +554,8 @@ define void @sext_v32i32_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v16i8_v16i16(<16 x i8> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v16i8_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -804,8 +804,8 @@ define void @zext_v32i8_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v8i16_v8i32(<8 x i16> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v8i16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -968,8 +968,8 @@ define void @zext_v32i16_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 { define void @zext_v4i32_v4i64(<4 x i32> %a, ptr %out) vscale_range(2,0) #0 { ; CHECK-LABEL: zext_v4i32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue 
p0.d, vl4 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll index 38444d83c1d7e..2d78945399176 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-rem.ll @@ -15,9 +15,9 @@ target triple = "aarch64-unknown-linux-gnu" define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: sshll v3.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -97,9 +97,9 @@ define <8 x i8> @srem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: sshll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: sshll v2.4s, v2.4h, #0 @@ -277,8 +277,8 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: ptrue p1.h, vl64 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h @@ -300,9 +300,9 @@ define void @srem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -310,9 +310,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: srem_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov w8, v2.s[1] ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b @@ -326,9 +326,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: srem_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sshll v2.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: sshll v3.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: mov w8, v2.s[1] ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b @@ -346,9 +346,9 @@ define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: srem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll2 v2.4s, 
v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: sshll v4.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: sshll v3.4s, v1.4h, #0 @@ -359,11 +359,11 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: srem_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h @@ -372,11 +372,11 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: srem_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_512-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: sunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h @@ -389,8 +389,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @srem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: sshll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: sshll2 v3.4s, v0.8h, #0 @@ -582,25 +582,25 @@ define void @srem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @srem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: sdiv z4.s, p0/m, z4.s, z0.s +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: sdiv z7.s, p0/m, z7.s, z6.s ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: sdiv z18.s, p0/m, z18.s, z17.s -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: sdiv z19.s, p0/m, z19.s, z3.s -; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s -; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s +; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: stp q1, q2, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v16i32: @@ -730,26 +730,26 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @srem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: srem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, 
#32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: sdiv z4.d, p0/m, z4.d, z0.d +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: sdiv z7.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: sdiv z18.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: sdiv z19.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d ; VBITS_GE_128-NEXT: movprfx z1, z2 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d +; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: srem_v8i64: @@ -833,9 +833,9 @@ define void @srem_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v2.8h, v1.8b, #0 ; VBITS_GE_128-NEXT: ushll v3.8h, v0.8b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -915,9 +915,9 @@ define <8 x i8> @urem_v8i8(<8 x i8> %op1, <8 x i8> %op2) #0 { define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v16i8: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.8h, v1.16b, #0 ; VBITS_GE_128-NEXT: ushll2 v3.8h, v0.16b, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v4.4s, v2.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v5.4s, v3.8h, #0 ; VBITS_GE_128-NEXT: ushll v2.4s, v2.4h, #0 @@ -1095,8 +1095,8 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p1/m, z3.s, z5.s ; CHECK-NEXT: ptrue p1.h, vl64 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p1, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h @@ -1118,9 +1118,9 @@ define void @urem_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v4i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_128-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: xtn v2.4h, v2.4s ; VBITS_GE_128-NEXT: mls v0.4h, v2.4h, v1.4h @@ -1128,9 +1128,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: urem_v4i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_256-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: mov w8, v2.s[1] ; VBITS_GE_256-NEXT: mov v3.16b, v2.16b @@ -1144,9 +1144,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: urem_v4i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; 
VBITS_GE_512-NEXT: ushll v2.4s, v1.4h, #0 ; VBITS_GE_512-NEXT: ushll v3.4s, v0.4h, #0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl4 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: mov w8, v2.s[1] ; VBITS_GE_512-NEXT: mov v3.16b, v2.16b @@ -1164,9 +1164,9 @@ define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) #0 { define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; VBITS_GE_128-LABEL: urem_v8i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ushll v4.4s, v0.4h, #0 ; VBITS_GE_128-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_128-NEXT: ushll v3.4s, v1.4h, #0 @@ -1177,11 +1177,11 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_256-LABEL: urem_v8i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_256-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_256-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: mls v0.8h, v2.8h, v1.8h @@ -1190,11 +1190,11 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { ; ; VBITS_GE_512-LABEL: urem_v8i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: // kill: def $q1 killed $q1 def $z1 ; VBITS_GE_512-NEXT: // kill: def $q0 killed $q0 def $z0 ; VBITS_GE_512-NEXT: uunpklo z2.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z3.s, z0.h +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 ; VBITS_GE_512-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; VBITS_GE_512-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_512-NEXT: mls v0.8h, v2.8h, v1.8h @@ -1207,8 +1207,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) #0 { define void @urem_v16i16(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i16: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q4, q1, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldr q0, [x0, #16] ; VBITS_GE_128-NEXT: ushll2 v2.4s, v1.8h, #0 ; VBITS_GE_128-NEXT: ushll2 v3.4s, v0.8h, #0 @@ -1400,25 +1400,25 @@ define void @urem_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @urem_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v16i32: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.s, vl4 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: udiv z4.s, p0/m, z4.s, z0.s +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: udiv z7.s, p0/m, z7.s, z6.s ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: udiv z18.s, p0/m, z18.s, z17.s -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: udiv z19.s, p0/m, z19.s, z3.s -; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s -; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: mls v1.4s, v4.4s, v0.4s ; VBITS_GE_128-NEXT: mls v2.4s, v19.4s, v3.4s -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls v16.4s, v18.4s, v17.4s +; VBITS_GE_128-NEXT: mls v5.4s, v7.4s, v6.4s ; VBITS_GE_128-NEXT: stp 
q1, q2, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v16i32: @@ -1548,26 +1548,26 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @urem_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_128-LABEL: urem_v8i64: ; VBITS_GE_128: // %bb.0: -; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q0, q3, [x1] +; VBITS_GE_128-NEXT: ptrue p0.d, vl2 ; VBITS_GE_128-NEXT: ldp q1, q2, [x0] ; VBITS_GE_128-NEXT: ldp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ldp q17, q6, [x1, #32] ; VBITS_GE_128-NEXT: movprfx z4, z1 ; VBITS_GE_128-NEXT: udiv z4.d, p0/m, z4.d, z0.d +; VBITS_GE_128-NEXT: movprfx z19, z2 +; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d ; VBITS_GE_128-NEXT: movprfx z7, z5 ; VBITS_GE_128-NEXT: udiv z7.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: movprfx z18, z16 ; VBITS_GE_128-NEXT: udiv z18.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: movprfx z19, z2 -; VBITS_GE_128-NEXT: udiv z19.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d -; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: msb z0.d, p0/m, z4.d, z1.d ; VBITS_GE_128-NEXT: movprfx z1, z2 ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z19.d, z3.d -; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] +; VBITS_GE_128-NEXT: mls z16.d, p0/m, z18.d, z17.d +; VBITS_GE_128-NEXT: mls z5.d, p0/m, z7.d, z6.d ; VBITS_GE_128-NEXT: stp q0, q1, [x0] +; VBITS_GE_128-NEXT: stp q16, q5, [x0, #32] ; VBITS_GE_128-NEXT: ret ; ; VBITS_GE_256-LABEL: urem_v8i64: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll index 710dce4de6dda..37396ba7011be 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-select.ll @@ -34,14 +34,14 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) vscale_ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl32 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i8>, ptr %a %op2 = load volatile <32 x i8>, ptr %b @@ -53,31 +53,31 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, w2 +; VBITS_GE_256-NEXT: ptrue p0.b ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 -; VBITS_GE_256-NEXT: ptrue p1.b -; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0, x8] -; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1, x8] -; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.b, p1, z1.b, z3.b -; VBITS_GE_256-NEXT: mov z0.b, p1/m, z2.b -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0, x8] -; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; 
VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z2.b }, p1/z, [x1, x8] +; VBITS_GE_256-NEXT: ld1b { z3.b }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.b, p0, z0.b, z2.b +; VBITS_GE_256-NEXT: sel z1.b, p0, z1.b, z3.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p1, [x0, x8] +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: mov z0.b, w2 -; VBITS_GE_512-NEXT: ptrue p1.b -; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z2.b }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; VBITS_GE_512-NEXT: sel z0.b, p1, z1.b, z2.b -; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p0.b +; VBITS_GE_512-NEXT: ptrue p1.b, vl64 +; VBITS_GE_512-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.b, p0, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <64 x i8>, ptr %a %op2 = load volatile <64 x i8>, ptr %b @@ -89,14 +89,14 @@ define void @select_v64i8(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl128 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i8>, ptr %a %op2 = load volatile <128 x i8>, ptr %b @@ -108,14 +108,14 @@ define void @select_v128i8(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v256i8(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w2 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0] -; CHECK-NEXT: ld1b { z2.b }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.b, p1/z, z0.b, #0 -; CHECK-NEXT: sel z0.b, p1, z1.b, z2.b -; CHECK-NEXT: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ptrue p1.b, vl256 +; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1] +; CHECK-NEXT: sel z0.b, p0, z0.b, z1.b +; CHECK-NEXT: st1b { z0.b }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <256 x i8>, ptr %a %op2 = load volatile <256 x i8>, ptr %b @@ -153,15 +153,15 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) vscale_ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl16 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel 
z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i16>, ptr %a %op2 = load volatile <16 x i16>, ptr %b @@ -173,33 +173,33 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w2 +; VBITS_GE_256-NEXT: ptrue p0.h ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.h +; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1, x8, lsl #1] -; VBITS_GE_256-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.h, p1, z1.h, z3.h -; VBITS_GE_256-NEXT: mov z0.h, p1/m, z2.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0, x8, lsl #1] -; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z2.h }, p1/z, [x1, x8, lsl #1] +; VBITS_GE_256-NEXT: ld1h { z3.h }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.h, p0, z0.h, z2.h +; VBITS_GE_256-NEXT: sel z1.h, p0, z1.h, z3.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p1, [x0, x8, lsl #1] +; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v32i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w2 -; VBITS_GE_512-NEXT: ptrue p1.h +; VBITS_GE_512-NEXT: ptrue p0.h +; VBITS_GE_512-NEXT: ptrue p1.h, vl32 ; VBITS_GE_512-NEXT: and z0.h, z0.h, #0x1 -; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z2.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; VBITS_GE_512-NEXT: sel z0.h, p1, z1.h, z2.h -; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.h, p0, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <32 x i16>, ptr %a %op2 = load volatile <32 x i16>, ptr %b @@ -211,15 +211,15 @@ define void @select_v32i16(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl64 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i16>, ptr %a %op2 = load volatile <64 x i16>, ptr %b @@ -231,15 
+231,15 @@ define void @select_v64i16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v128i16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w2 -; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ptrue p1.h, vl128 ; CHECK-NEXT: and z0.h, z0.h, #0x1 -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] -; CHECK-NEXT: ld1h { z2.h }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.h, p1/z, z0.h, #0 -; CHECK-NEXT: sel z0.h, p1, z1.h, z2.h -; CHECK-NEXT: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0] +; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1] +; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h +; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <128 x i16>, ptr %a %op2 = load volatile <128 x i16>, ptr %b @@ -277,15 +277,15 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) vscale_ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl8 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <8 x i32>, ptr %a %op2 = load volatile <8 x i32>, ptr %b @@ -297,33 +297,33 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: and w8, w2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.s +; VBITS_GE_256-NEXT: ptrue p0.s ; VBITS_GE_256-NEXT: mov z0.s, w8 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1, x8, lsl #2] -; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.s, p1, z1.s, z3.s -; VBITS_GE_256-NEXT: mov z0.s, p1/m, z2.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0, x8, lsl #2] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z2.s }, p1/z, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: ld1w { z3.s }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.s, p0, z0.s, z2.s +; VBITS_GE_256-NEXT: sel z1.s, p0, z1.s, z3.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p1, [x0, x8, lsl #2] +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: and w8, w2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.s +; VBITS_GE_512-NEXT: ptrue p0.s ; VBITS_GE_512-NEXT: mov z0.s, w8 -; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z2.s }, p0/z, 
[x1] -; VBITS_GE_512-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; VBITS_GE_512-NEXT: sel z0.s, p1, z1.s, z2.s -; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.s, vl16 +; VBITS_GE_512-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.s, p0, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <16 x i32>, ptr %a %op2 = load volatile <16 x i32>, ptr %b @@ -335,15 +335,15 @@ define void @select_v16i32(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl32 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i32>, ptr %a %op2 = load volatile <32 x i32>, ptr %b @@ -355,15 +355,15 @@ define void @select_v32i32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v64i32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: and w8, w2, #0x1 -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.s, p1/z, z0.s, #0 -; CHECK-NEXT: sel z0.s, p1, z1.s, z2.s -; CHECK-NEXT: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ptrue p1.s, vl64 +; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0] +; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1] +; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s +; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <64 x i32>, ptr %a %op2 = load volatile <64 x i32>, ptr %b @@ -401,16 +401,16 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) vscale_ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl4 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <4 x i64>, ptr %a %op2 = load volatile <4 x i64>, ptr %b @@ -422,35 +422,35 @@ define void @select_v4i64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 { define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { ; VBITS_GE_256-LABEL: select_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def 
$w2 killed $w2 def $x2 ; VBITS_GE_256-NEXT: and x8, x2, #0x1 -; VBITS_GE_256-NEXT: ptrue p1.d +; VBITS_GE_256-NEXT: ptrue p0.d ; VBITS_GE_256-NEXT: mov z0.d, x8 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0] -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sel z1.d, p1, z1.d, z3.d -; VBITS_GE_256-NEXT: mov z0.d, p1/m, z2.d -; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [x0] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: sel z0.d, p0, z0.d, z2.d +; VBITS_GE_256-NEXT: sel z1.d, p0, z1.d, z3.d +; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] +; VBITS_GE_256-NEXT: st1d { z1.d }, p1, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: select_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $w2 killed $w2 def $x2 ; VBITS_GE_512-NEXT: and x8, x2, #0x1 -; VBITS_GE_512-NEXT: ptrue p1.d +; VBITS_GE_512-NEXT: ptrue p0.d ; VBITS_GE_512-NEXT: mov z0.d, x8 -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; VBITS_GE_512-NEXT: sel z0.d, p1, z1.d, z2.d -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ptrue p1.d, vl8 +; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [x0] +; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] +; VBITS_GE_512-NEXT: sel z0.d, p0, z0.d, z1.d +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [x0] ; VBITS_GE_512-NEXT: ret %op1 = load volatile <8 x i64>, ptr %a %op2 = load volatile <8 x i64>, ptr %b @@ -462,16 +462,16 @@ define void @select_v8i64(ptr %a, ptr %b, i1 %mask) #0 { define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { ; CHECK-LABEL: select_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl16 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <16 x i64>, ptr %a %op2 = load volatile <16 x i64>, ptr %b @@ -483,16 +483,16 @@ define void @select_v16i64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 { define void @select_v32i64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 { ; CHECK-LABEL: select_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne 
p1.d, p1/z, z0.d, #0 -; CHECK-NEXT: sel z0.d, p1, z1.d, z2.d -; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ptrue p1.d, vl32 +; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p1, [x0] ; CHECK-NEXT: ret %op1 = load volatile <32 x i64>, ptr %a %op2 = load volatile <32 x i64>, ptr %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 50040eaa61e6c..5bb012ae57503 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -131,8 +131,8 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 { define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v8i16_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -354,7 +354,6 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ucvtf z0.h, p0/m, z0.s @@ -363,7 +362,8 @@ define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v16i32_v16f16: @@ -535,8 +535,8 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 { define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v4i32_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -759,7 +759,6 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ucvtf z0.s, p0/m, z0.d @@ -768,7 +767,8 @@ define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: ucvtf_v8i64_v8f32: @@ -1038,8 +1038,8 @@ define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) vscale_range(2,0) #0 { define void @scvtf_v8i16_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v8i16_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -1273,7 +1273,6 @@ define 
void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: scvtf z0.h, p0/m, z0.s @@ -1282,7 +1281,8 @@ define void @scvtf_v16i32_v16f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v16i32_v16f16: @@ -1454,8 +1454,8 @@ define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) vscale_range(2,0) #0 { define void @scvtf_v4i32_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v4i32_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x1] @@ -1684,7 +1684,6 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: scvtf z0.s, p0/m, z0.d @@ -1693,7 +1692,8 @@ define void @scvtf_v8i64_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: scvtf_v8i64_v8f32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll index e23151475014d..f2ad98f8caec9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -246,16 +246,16 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1w { z0.d }, p0/z, [z0.d] ; VBITS_GE_256-NEXT: ld1w { z1.d }, p0/z, [z1.d] -; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x0] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_gather_v8i32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll index 67a53d4e15f3b..55d37d1bda5e4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-loads.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define <16 x i8> @masked_load_v16i8(ptr %src, <16 x i1> %mask) { ; CHECK-LABEL: 
masked_load_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll index bdd6ce0647016..1a19b77f53c67 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-128bit-stores.ll @@ -11,12 +11,12 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_store_v16i8(ptr %dst, <16 x i1> %mask) { ; CHECK-LABEL: masked_store_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: st1b { z1.b }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v16i8(<16 x i8> zeroinitializer, ptr %dst, i32 8, <16 x i1> %mask) ret void @@ -27,11 +27,11 @@ define void @masked_store_v8f16(ptr %dst, <8 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: ptrue p0.h, vl8 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.8h, v0.8h, #15 ; CHECK-NEXT: cmlt v0.8h, v0.8h, #0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 -; CHECK-NEXT: st1h { z1.h }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v8f16(<8 x half> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) ret void @@ -42,11 +42,11 @@ define void @masked_store_v4f32(ptr %dst, <4 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: st1w { z1.s }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v4f32(<4 x float> zeroinitializer, ptr %dst, i32 8, <4 x i1> %mask) ret void @@ -57,11 +57,11 @@ define void @masked_store_v2f64(ptr %dst, <2 x i1> %mask) { ; CHECK: // %bb.0: ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: movi v1.2d, #0000000000000000 ; CHECK-NEXT: shl v0.2d, v0.2d, #63 ; CHECK-NEXT: cmlt v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z1.d }, p0, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret call void @llvm.masked.store.v2f64(<2 x double> zeroinitializer, ptr %dst, i32 8, <2 x i1> %mask) ret void diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 92fce4584f6a9..27e95489f8ad7 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -41,11 +41,11 @@ define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: 
sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1b { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i8>, ptr %a @@ -65,7 +65,6 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip2 v1.8b, v0.8b, v0.8b ; VBITS_GE_256-NEXT: zip1 v0.8b, v0.8b, v0.8b -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_GE_256-NEXT: shl v0.4h, v0.4h, #8 ; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 @@ -75,10 +74,11 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1b { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1b { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -93,11 +93,11 @@ define void @masked_gather_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b @@ -118,11 +118,11 @@ define void @masked_gather_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: cmeq v0.16b, v0.16b, #0 ; CHECK-NEXT: sunpklo z0.h, z0.b -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1b { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1b { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b @@ -194,10 +194,10 @@ define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: str d0, [x0] @@ -219,15 +219,15 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, 
#8 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -242,10 +242,10 @@ define void @masked_gather_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: str q0, [x0] @@ -332,9 +332,9 @@ define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -354,18 +354,18 @@ define void @masked_gather_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p2.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: punpklo p3.h, p1.b ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p1.h, p1.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p2.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: and p1.b, p3/z, p3.b, p2.b -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p2/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] @@ -460,8 +460,8 @@ define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; 
CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmeq v0.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] @@ -481,9 +481,9 @@ define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <4 x i64>, ptr %a @@ -500,13 +500,13 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] +; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z2.d] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p1/z, [z1.d] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0] ; VBITS_GE_256-NEXT: ret @@ -515,9 +515,9 @@ define void @masked_gather_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512: // %bb.0: ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret %cval = load <8 x i64>, ptr %a @@ -533,9 +533,9 @@ define void @masked_gather_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <16 x i64>, ptr %a @@ -551,9 +551,9 @@ define void @masked_gather_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1d { z0.d }, p1/z, [z1.d] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %cval = load <32 x i64>, ptr %a @@ -603,10 +603,10 @@ define void @masked_gather_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v0.4h, v0.4h, #0.0 ; CHECK-NEXT: sunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, 
z0.s, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: str d0, [x0] @@ -626,17 +626,17 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: fcmeq v0.8h, v0.8h, #0.0 -; VBITS_GE_256-NEXT: sunpklo z2.s, z0.h +; VBITS_GE_256-NEXT: sunpklo z1.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 -; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: ld1h { z1.d }, p1/z, [z1.d] -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1h { z0.d }, p0/z, [z2.d] +; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -651,10 +651,10 @@ define void @masked_gather_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: fcmeq v0.8h, v0.8h, #0.0 ; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_512-NEXT: str q0, [x0] @@ -741,9 +741,9 @@ define void @masked_gather_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 ; CHECK-NEXT: sunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p1/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -763,18 +763,18 @@ define void @masked_gather_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: ptrue p2.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p2/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z2.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: punpklo p3.h, p1.b ; VBITS_GE_256-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p1.h, p1.b +; VBITS_GE_256-NEXT: and p1.b, p1/z, p1.b, p2.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: and p1.b, p3/z, p3.b, p2.b -; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: ld1w { z1.d }, p1/z, [z1.d] +; VBITS_GE_256-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: cmpne p1.d, p2/z, z0.d, #0 -; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z2.d] -; VBITS_GE_256-NEXT: ptrue p1.s, vl4 +; VBITS_GE_256-NEXT: ld1d { z0.d }, p2/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: ld1w { z0.d }, p1/z, [z0.d] +; VBITS_GE_256-NEXT: ptrue p1.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; 
VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s ; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0] @@ -869,8 +869,8 @@ define void @masked_gather_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_gather_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcmeq v0.2d, v0.2d, #0.0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x1] @@ -1202,8 +1202,8 @@ define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) # ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] -; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: ld1w { z0.d }, p3/z, [z0.d] +; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll index 467378e7da59b..c22d9e71c51a9 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-loads.ll @@ -35,9 +35,9 @@ define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_load_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: fcmeq v0.2s, v0.2s, v1.2s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -53,9 +53,9 @@ define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ; CHECK-LABEL: masked_load_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcmeq v0.4s, v0.4s, v1.4s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] @@ -401,8 +401,8 @@ define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -436,8 +436,8 @@ define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -504,8 +504,8 @@ define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; 
VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 @@ -603,8 +603,8 @@ define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: cmeq v0.16b, v0.16b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -638,8 +638,8 @@ define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: ldr d0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: cmpne p0.b, p0/z, z0.b, #0 @@ -706,8 +706,8 @@ define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 { define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: ldr q0, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v0.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z0.h, #0 @@ -782,11 +782,11 @@ define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.b, vl16 -; VBITS_GE_256-NEXT: ptrue p2.b, vl32 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b -; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: sunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1000,11 +1000,11 @@ define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ptrue p2.s, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_256-NEXT: sunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1041,11 +1041,11 @@ define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.b, vl16 -; VBITS_GE_256-NEXT: ptrue p2.b, vl32 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p1, z1.b, z0.b -; VBITS_GE_256-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; VBITS_GE_256-NEXT: ptrue p1.b, vl32 +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; 
VBITS_GE_256-NEXT: ld1b { z0.b }, p1/z, [x0] ; VBITS_GE_256-NEXT: uunpklo z1.h, z0.b ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 @@ -1259,11 +1259,11 @@ define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 { ; VBITS_GE_256-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: mov z1.d, p2/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p1.s, vl4 -; VBITS_GE_256-NEXT: ptrue p2.s, vl8 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z0.s -; VBITS_GE_256-NEXT: cmpne p1.s, p2/z, z1.s, #0 +; VBITS_GE_256-NEXT: ptrue p1.s, vl8 +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z1.s, #0 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p1/z, [x0] ; VBITS_GE_256-NEXT: uunpklo z1.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index e2d341c22efc2..e3e06dcdf17f3 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -39,12 +39,12 @@ define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: cmeq v1.4h, v0.4h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i8>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -61,11 +61,11 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0 ; VBITS_GE_256-NEXT: zip1 v3.8b, v0.8b, v0.8b +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b ; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b ; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 ; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h @@ -78,10 +78,10 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z4.d] -; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8i8: @@ -92,12 +92,12 @@ define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, 
p0/z, [x1] +; VBITS_GE_512-NEXT: st1b { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i8>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -115,12 +115,12 @@ define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: sunpklo z1.h, z1.b ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i8>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -135,13 +135,13 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: ptrue p1.d, vl32 ; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] +; CHECK-NEXT: uunpklo z1.h, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 -; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z1.h ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -187,10 +187,10 @@ define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i16>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -208,20 +208,20 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8i16: @@ -232,10 +232,10 @@ define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 
+; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i16>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -253,9 +253,9 @@ define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i16>, ptr %a @@ -274,9 +274,9 @@ define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i16>, ptr %a @@ -317,9 +317,9 @@ define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: cmeq v1.4s, v0.4s, #0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i32>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -333,21 +333,21 @@ define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: cmpeq p1.s, p0/z, z0.s, #0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p0.h, p0.b +; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z3.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; @@ -434,8 +434,8 @@ define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] @@ -454,8 +454,8 @@ define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z0.d }, p0/z, 
[x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x i64>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -484,8 +484,8 @@ define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [z1.d] +; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x i64>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -500,8 +500,8 @@ define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x i64>, ptr %a %ptrs = load <16 x ptr>, ptr %b @@ -516,8 +516,8 @@ define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] -; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 +; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x i64>, ptr %a %ptrs = load <32 x ptr>, ptr %b @@ -539,15 +539,15 @@ define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: sshll v2.4s, v2.4h, #0 -; CHECK-NEXT: uunpklo z1.d, z1.s ; CHECK-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NEXT: mov w8, v2.s[1] ; CHECK-NEXT: mov v0.h[1], w8 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: ldr q0, [x1] -; CHECK-NEXT: st1h { z1.d }, p0, [z0.d] +; CHECK-NEXT: uunpklo z0.d, z1.s +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <2 x half>, ptr %a %ptrs = load <2 x ptr>, ptr %b @@ -565,10 +565,10 @@ define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x half>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -586,20 +586,20 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 ; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h ; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h ; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s ; 
VBITS_GE_256-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 -; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] -; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z4.d] +; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] +; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_scatter_v8f16: @@ -610,10 +610,10 @@ define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z2.d] +; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x half>, ptr %a %ptrs = load <8 x ptr>, ptr %b @@ -631,9 +631,9 @@ define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <16 x half>, ptr %a @@ -652,9 +652,9 @@ define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret %vals = load <32 x half>, ptr %a @@ -695,9 +695,9 @@ define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: sunpklo z1.d, z1.s -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x1] -; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] +; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] +; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] ; CHECK-NEXT: ret %vals = load <4 x float>, ptr %a %ptrs = load <4 x ptr>, ptr %b @@ -711,21 +711,21 @@ define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 +; VBITS_GE_256-NEXT: ptrue p1.d, vl4 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_GE_256-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 +; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] +; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s ; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_GE_256-NEXT: mov z1.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: punpklo p2.h, p1.b ; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1] -; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] +; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: punpklo p0.h, p0.b +; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b ; VBITS_GE_256-NEXT: ext 
z1.b, z1.b, z1.b, #16 +; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] ; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s -; VBITS_GE_256-NEXT: and p1.b, p2/z, p2.b, p0.b -; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 -; VBITS_GE_256-NEXT: st1w { z2.d }, p1, [z3.d] +; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] ; VBITS_GE_256-NEXT: ret ; @@ -812,8 +812,8 @@ define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 { define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: ldr q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll index 68fb4cc6afb09..b0d4f79aea110 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-stores.ll @@ -34,9 +34,9 @@ define void @masked_store_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_store_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldr d0, [x0] ; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: fcmeq v1.2s, v0.2s, v1.2s ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -51,9 +51,9 @@ define void @masked_store_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 { define void @masked_store_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 { ; CHECK-LABEL: masked_store_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcmeq v1.4s, v0.4s, v1.4s ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] @@ -161,11 +161,11 @@ define void @masked_store_trunc_v8i64i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s -; VBITS_GE_256-NEXT: cmpne p0.s, p1/z, z3.s, #0 -; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z3.s, #0 +; VBITS_GE_256-NEXT: st1b { z1.s }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i8: @@ -197,14 +197,14 @@ define void @masked_store_trunc_v8i64i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z1.d, z3.d ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: mov z2.d, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 -; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h -; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s ; VBITS_GE_256-NEXT: ptrue p0.h, vl8 ; 
VBITS_GE_256-NEXT: uzp1 z2.h, z3.h, z3.h @@ -246,11 +246,11 @@ define void @masked_store_trunc_v8i64i32(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.d, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z2.s, z2.s, z2.s -; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_GE_256-NEXT: uzp1 z3.s, z3.s, z3.s ; VBITS_GE_256-NEXT: splice z3.s, p0, z3.s, z2.s -; VBITS_GE_256-NEXT: cmpne p0.s, p1/z, z3.s, #0 -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.s, p1/z, z3.s, #0 +; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v8i64i32: @@ -282,14 +282,14 @@ define void @masked_store_trunc_v16i32i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff +; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 -; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b -; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h -; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b ; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] @@ -327,17 +327,17 @@ define void @masked_store_trunc_v16i32i16(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z1.s, z3.s ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: mov z2.s, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 +; VBITS_GE_256-NEXT: ptrue p1.h, vl8 ; VBITS_GE_256-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_256-NEXT: ptrue p0.h, vl8 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z2.h, z2.h, z2.h ; VBITS_GE_256-NEXT: uzp1 z3.h, z3.h, z3.h ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b -; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: mov v3.d[1], v2.d[0] ; VBITS_GE_256-NEXT: sunpklo z2.h, z3.b -; VBITS_GE_256-NEXT: cmpne p0.h, p1/z, z2.h, #0 +; VBITS_GE_256-NEXT: cmpne p0.h, p0/z, z2.h, #0 ; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x2] ; VBITS_GE_256-NEXT: ret ; @@ -375,11 +375,11 @@ define void @masked_store_trunc_v32i16i8(ptr %ap, ptr %bp, ptr %dest) #0 { ; VBITS_GE_256-NEXT: mov z3.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b -; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b +; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b ; VBITS_GE_256-NEXT: splice z3.b, p0, z3.b, z2.b -; VBITS_GE_256-NEXT: cmpne p0.b, p1/z, z3.b, #0 -; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x2] +; VBITS_GE_256-NEXT: cmpne p1.b, p1/z, z3.b, #0 +; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x2] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: masked_store_trunc_v32i16i8: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll index a5303c901b80f..fb169491b0c90 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll +++ 
b/llvm/test/CodeGen/AArch64/sve-fixed-length-shuffles.ll @@ -64,7 +64,6 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: mov x8, #16 // =0x10 ; CHECK-NEXT: mov x10, #8 // =0x8 -; CHECK-NEXT: ld1w { z4.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: mov v2.b[7], w11 ; CHECK-NEXT: mov v1.b[7], w9 ; CHECK-NEXT: uunpklo z3.h, z3.b @@ -86,22 +85,23 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) vscale_rang ; CHECK-NEXT: asr z2.s, z2.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z0.s, #0 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NEXT: cmpne p2.s, p0/z, z3.s, #0 -; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: and z2.s, z2.s, #0x1 ; CHECK-NEXT: and z1.s, z1.s, #0x1 -; CHECK-NEXT: mov z4.s, p1/m, #0 // =0x0 -; CHECK-NEXT: mov z0.s, p2/m, #0 // =0x0 +; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: cmpne p3.s, p0/z, z2.s, #0 -; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0] -; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2] -; CHECK-NEXT: st1w { z0.s }, p0, [x0, x9, lsl #2] -; CHECK-NEXT: mov z3.s, p3/m, #0 // =0x0 -; CHECK-NEXT: mov z2.s, p1/m, #0 // =0x0 -; CHECK-NEXT: st1w { z3.s }, p0, [x0, x10, lsl #2] -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: cmpne p4.s, p0/z, z1.s, #0 +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x10, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: mov z3.s, p2/m, #0 // =0x0 +; CHECK-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] +; CHECK-NEXT: mov z2.s, p3/m, #0 // =0x0 +; CHECK-NEXT: mov z1.s, p4/m, #0 // =0x0 +; CHECK-NEXT: st1w { z3.s }, p0, [x0, x9, lsl #2] +; CHECK-NEXT: st1w { z2.s }, p0, [x0, x10, lsl #2] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: .LBB1_2: // %exit ; CHECK-NEXT: ret %broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll index f97ca05f3bdd4..b633057be139c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-splat-vector.ll @@ -34,8 +34,8 @@ define <16 x i8> @splat_v16i8(i8 %a) vscale_range(2,0) #0 { define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl32 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i8> undef, i8 %a, i64 0 @@ -47,8 +47,8 @@ define void @splat_v32i8(i8 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v64i8(i8 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v64i8: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov z0.b, w0 +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 ; VBITS_GE_256-NEXT: mov w8, #32 // =0x20 ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1, x8] ; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x1] @@ -56,8 +56,8 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v64i8: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: mov z0.b, w0 +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 ; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <64 x 
i8> undef, i8 %a, i64 0 @@ -69,8 +69,8 @@ define void @splat_v64i8(i8 %a, ptr %b) #0 { define void @splat_v128i8(i8 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <128 x i8> undef, i8 %a, i64 0 @@ -82,8 +82,8 @@ define void @splat_v128i8(i8 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v256i8(i8 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v256i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: mov z0.b, w0 +; CHECK-NEXT: ptrue p0.b, vl256 ; CHECK-NEXT: st1b { z0.b }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <256 x i8> undef, i8 %a, i64 0 @@ -117,8 +117,8 @@ define <8 x i16> @splat_v8i16(i16 %a) vscale_range(2,0) #0 { define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <16 x i16> undef, i16 %a, i64 0 @@ -130,8 +130,8 @@ define void @splat_v16i16(i16 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v32i16(i16 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32i16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov z0.h, w0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1, x8, lsl #1] ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x1] @@ -139,8 +139,8 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v32i16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, w0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <32 x i16> undef, i16 %a, i64 0 @@ -152,8 +152,8 @@ define void @splat_v32i16(i16 %a, ptr %b) #0 { define void @splat_v64i16(i16 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <64 x i16> undef, i16 %a, i64 0 @@ -165,8 +165,8 @@ define void @splat_v64i16(i16 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v128i16(i16 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v128i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, w0 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: st1h { z0.h }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <128 x i16> undef, i16 %a, i64 0 @@ -200,8 +200,8 @@ define <4 x i32> @splat_v4i32(i32 %a) vscale_range(2,0) #0 { define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <8 x i32> undef, i32 %a, i64 0 @@ -213,8 +213,8 @@ define void @splat_v8i32(i32 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v16i32(i32 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16i32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov z0.s, w0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; 
VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] @@ -222,8 +222,8 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v16i32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, w0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <16 x i32> undef, i32 %a, i64 0 @@ -235,8 +235,8 @@ define void @splat_v16i32(i32 %a, ptr %b) #0 { define void @splat_v32i32(i32 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i32> undef, i32 %a, i64 0 @@ -248,8 +248,8 @@ define void @splat_v32i32(i32 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v64i32(i32 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v64i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov z0.s, w0 +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: st1w { z0.s }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <64 x i32> undef, i32 %a, i64 0 @@ -283,8 +283,8 @@ define <2 x i64> @splat_v2i64(i64 %a) vscale_range(2,0) #0 { define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <4 x i64> undef, i64 %a, i64 0 @@ -296,8 +296,8 @@ define void @splat_v4i64(i64 %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v8i64(i64 %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8i64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov z0.d, x0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1, x8, lsl #3] ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x1] @@ -305,8 +305,8 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v8i64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov z0.d, x0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x1] ; VBITS_GE_512-NEXT: ret %insert = insertelement <8 x i64> undef, i64 %a, i64 0 @@ -318,8 +318,8 @@ define void @splat_v8i64(i64 %a, ptr %b) #0 { define void @splat_v16i64(i64 %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v16i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <16 x i64> undef, i64 %a, i64 0 @@ -331,8 +331,8 @@ define void @splat_v16i64(i64 %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v32i64(i64 %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v32i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov z0.d, x0 +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %insert = insertelement <32 x i64> undef, i64 %a, i64 0 @@ -372,8 +372,8 @@ define <8 x half> @splat_v8f16(half %a) vscale_range(2,0) #0 { define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl16 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl16 ; 
CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -386,8 +386,8 @@ define void @splat_v16f16(half %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v32f16(half %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v32f16: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 ; VBITS_GE_256-NEXT: mov z0.h, h0 ; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1] @@ -396,8 +396,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v32f16: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: // kill: def $h0 killed $h0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 ; VBITS_GE_512-NEXT: mov z0.h, h0 ; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -410,8 +410,8 @@ define void @splat_v32f16(half %a, ptr %b) #0 { define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v64f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -424,8 +424,8 @@ define void @splat_v64f16(half %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v128f16(half %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v128f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl128 ; CHECK-NEXT: mov z0.h, h0 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret @@ -462,8 +462,8 @@ define <4 x float> @splat_v4f32(float %a, <4 x float> %op2) vscale_range(2,0) #0 define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl8 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -476,8 +476,8 @@ define void @splat_v8f32(float %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v16f32(float %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v16f32: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: mov z0.s, s0 ; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2] @@ -486,8 +486,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v16f32: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: // kill: def $s0 killed $s0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 ; VBITS_GE_512-NEXT: mov z0.s, s0 ; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -500,8 +500,8 @@ define void @splat_v16f32(float %a, ptr %b) #0 { define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v32f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -514,8 +514,8 @@ define void @splat_v32f32(float %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v64f32(float %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v64f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl64 
; CHECK-NEXT: // kill: def $s0 killed $s0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl64 ; CHECK-NEXT: mov z0.s, s0 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret @@ -550,8 +550,8 @@ define <2 x double> @splat_v2f64(double %a, <2 x double> %op2) vscale_range(2,0) define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: splat_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -564,8 +564,8 @@ define void @splat_v4f64(double %a, ptr %b) vscale_range(2,0) #0 { define void @splat_v8f64(double %a, ptr %b) #0 { ; VBITS_GE_256-LABEL: splat_v8f64: ; VBITS_GE_256: // %bb.0: -; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 ; VBITS_GE_256-NEXT: mov z0.d, d0 ; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] @@ -574,8 +574,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 { ; ; VBITS_GE_512-LABEL: splat_v8f64: ; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: // kill: def $d0 killed $d0 def $z0 +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: mov z0.d, d0 ; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] ; VBITS_GE_512-NEXT: ret @@ -588,8 +588,8 @@ define void @splat_v8f64(double %a, ptr %b) #0 { define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 { ; CHECK-LABEL: splat_v16f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl16 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -602,8 +602,8 @@ define void @splat_v16f64(double %a, ptr %b) vscale_range(8,0) #0 { define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 { ; CHECK-LABEL: splat_v32f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl32 ; CHECK-NEXT: mov z0.d, d0 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret @@ -620,8 +620,8 @@ define void @splat_v32f64(double %a, ptr %b) vscale_range(16,0) #0 { define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: mov z0.b, #1 // =0x1 +; CHECK-NEXT: ptrue p0.b, vl64 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <64 x i8> undef, i8 1, i64 0 @@ -633,8 +633,8 @@ define void @splat_imm_v64i8(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: mov z0.h, #2 // =0x2 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <32 x i16> undef, i16 2, i64 0 @@ -646,8 +646,8 @@ define void @splat_imm_v32i16(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: mov z0.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <16 x i32> undef, i32 3, i64 0 @@ -659,8 +659,8 @@ define void @splat_imm_v16i32(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 { ; 
CHECK-LABEL: splat_imm_v8i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: mov z0.d, #4 // =0x4 +; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <8 x i64> undef, i64 4, i64 0 @@ -676,8 +676,8 @@ define void @splat_imm_v8i64(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v32f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: fmov z0.h, #5.00000000 +; CHECK-NEXT: ptrue p0.h, vl32 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <32 x half> undef, half 5.0, i64 0 @@ -689,8 +689,8 @@ define void @splat_imm_v32f16(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v16f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: fmov z0.s, #6.00000000 +; CHECK-NEXT: ptrue p0.s, vl16 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <16 x float> undef, float 6.0, i64 0 @@ -702,8 +702,8 @@ define void @splat_imm_v16f32(ptr %a) vscale_range(4,0) #0 { define void @splat_imm_v8f64(ptr %a) vscale_range(4,0) #0 { ; CHECK-LABEL: splat_imm_v8f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: fmov z0.d, #7.00000000 +; CHECK-NEXT: ptrue p0.d, vl8 ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret %insert = insertelement <8 x double> undef, double 7.0, i64 0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll index 2dc4bddb81a6d..020d5cb53bf21 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-trunc-stores.ll @@ -36,14 +36,14 @@ define void @store_trunc_v8i64i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1b { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1b { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v8i64i8: @@ -117,14 +117,14 @@ define void @store_trunc_v8i64i32(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.d, vl4 ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 -; VBITS_GE_256-NEXT: ptrue p1.s, vl8 ; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl4 ; VBITS_GE_256-NEXT: uzp1 z0.s, z0.s, z0.s ; VBITS_GE_256-NEXT: uzp1 z1.s, z1.s, z1.s ; VBITS_GE_256-NEXT: splice z1.s, p0, z1.s, z0.s -; VBITS_GE_256-NEXT: st1w { z1.s }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v8i64i32: @@ -172,14 +172,14 @@ define void @store_trunc_v16i32i16(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 -; VBITS_GE_256-NEXT: ptrue p1.h, vl16 ; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.h, 
vl8 ; VBITS_GE_256-NEXT: uzp1 z0.h, z0.h, z0.h ; VBITS_GE_256-NEXT: uzp1 z1.h, z1.h, z1.h ; VBITS_GE_256-NEXT: splice z1.h, p0, z1.h, z0.h -; VBITS_GE_256-NEXT: st1h { z1.h }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v16i32i16: @@ -199,14 +199,14 @@ define void @store_trunc_v32i16i8(ptr %ap, ptr %dest) #0 { ; VBITS_GE_256: // %bb.0: ; VBITS_GE_256-NEXT: ptrue p0.h, vl16 ; VBITS_GE_256-NEXT: mov x8, #16 // =0x10 -; VBITS_GE_256-NEXT: ptrue p1.b, vl32 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.b, vl16 ; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b ; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b ; VBITS_GE_256-NEXT: splice z1.b, p0, z1.b, z0.b -; VBITS_GE_256-NEXT: st1b { z1.b }, p1, [x1] +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x1] ; VBITS_GE_256-NEXT: ret ; ; VBITS_GE_512-LABEL: store_trunc_v32i16i8: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll index 68c234a20d110..28094c7b68e7c 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-vector-shuffle-tbl.ll @@ -559,13 +559,13 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; ; SVE2_128_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: ; SVE2_128_NOMAX: // %bb.0: -; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_128_NOMAX-NEXT: cnth x8 ; SVE2_128_NOMAX-NEXT: adrp x9, .LCPI7_0 ; SVE2_128_NOMAX-NEXT: adrp x10, .LCPI7_1 ; SVE2_128_NOMAX-NEXT: mov z0.h, w8 ; SVE2_128_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] ; SVE2_128_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_128_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_128_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h ; SVE2_128_NOMAX-NEXT: ldr q1, [x0] ; SVE2_128_NOMAX-NEXT: ldr q2, [x1] @@ -575,13 +575,13 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; ; SVE2_NOMIN_NOMAX-LABEL: shuffle_index_indices_from_both_ops_i16: ; SVE2_NOMIN_NOMAX: // %bb.0: -; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_NOMIN_NOMAX-NEXT: cnth x8 ; SVE2_NOMIN_NOMAX-NEXT: adrp x9, .LCPI7_0 ; SVE2_NOMIN_NOMAX-NEXT: adrp x10, .LCPI7_1 ; SVE2_NOMIN_NOMAX-NEXT: mov z0.h, w8 ; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x9, :lo12:.LCPI7_0] ; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x10, :lo12:.LCPI7_1] +; SVE2_NOMIN_NOMAX-NEXT: ptrue p0.h, vl8 ; SVE2_NOMIN_NOMAX-NEXT: mad z0.h, p0/m, z1.h, z2.h ; SVE2_NOMIN_NOMAX-NEXT: ldr q1, [x0] ; SVE2_NOMIN_NOMAX-NEXT: ldr q2, [x1] @@ -597,9 +597,9 @@ define <8 x i16> @shuffle_index_indices_from_both_ops_i16(ptr %a, ptr %b) { ; SVE2_MIN_256_NOMAX-NEXT: adrp x9, .LCPI7_1 ; SVE2_MIN_256_NOMAX-NEXT: add x9, x9, :lo12:.LCPI7_1 ; SVE2_MIN_256_NOMAX-NEXT: cnth x10 -; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 ; SVE2_MIN_256_NOMAX-NEXT: ld1h { z0.h }, p0/z, [x8] ; SVE2_MIN_256_NOMAX-NEXT: ld1h { z1.h }, p0/z, [x9] +; SVE2_MIN_256_NOMAX-NEXT: mov z2.h, w10 ; SVE2_MIN_256_NOMAX-NEXT: mad z0.h, p0/m, z2.h, z1.h ; SVE2_MIN_256_NOMAX-NEXT: ldr q1, [x0] ; SVE2_MIN_256_NOMAX-NEXT: ldr q2, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll index 5ff9f0f0df62f..afe13851f0b95 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-int-min-max.ll @@ -7,11 +7,11 @@ define i64 
@scalable_int_min_max(ptr %arg, ptr %arg1, %i37, < ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #3745 // =0xea1 ; CHECK-NEXT: movk w8, #16618, lsl #16 +; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: mov w8, #57344 // =0xe000 ; CHECK-NEXT: movk w8, #17535, lsl #16 ; CHECK-NEXT: mov z5.s, w8 -; CHECK-NEXT: ld1w { z3.d }, p0/z, [x0] ; CHECK-NEXT: fmul z4.s, p0/m, z4.s, z3.s ; CHECK-NEXT: fadd z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: mov z5.d, #1023 // =0x3ff diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll index aefc8de431436..6420071b3dce4 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll @@ -92,8 +92,8 @@ define @fsqrt_recip_8f16( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.h, z0.h ; CHECK-NEXT: ptrue p0.h -; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fmul z2.h, z1.h, z1.h ; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h ; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h @@ -117,8 +117,8 @@ define @fsqrt_recip_4f32( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.s, z0.s ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z2.s, z1.s, z1.s ; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s ; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: fmul z2.s, z1.s, z1.s @@ -145,8 +145,8 @@ define @fsqrt_recip_2f64( %a) #0 { ; CHECK: // %bb.0: ; CHECK-NEXT: frsqrte z1.d, z0.d ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: fmul z2.d, z1.d, z1.d ; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d ; CHECK-NEXT: fmul z1.d, z1.d, z2.d ; CHECK-NEXT: fmul z2.d, z1.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll index 460d8a8694bc4..1a2ab8d4253ab 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-reduce-fadda.ll @@ -51,10 +51,10 @@ define half @fadda_nxv6f16( %v, half %s) { ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #32768 // =0x8000 -; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: fmov s0, s1 ; CHECK-NEXT: st1h { z2.d }, p1, [sp, #3, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll index 813f1601e809e..584c29ebcfc04 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptosi-sat.ll @@ -14,8 +14,8 @@ declare @llvm.fptosi.sat.nxv4f32.nxv4i64( define @test_signed_v2f32_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff @@ -38,8 +38,8 @@ define @test_signed_v2f32_v2i32( %f) { define @test_signed_v4f32_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, 
#0x80000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff @@ -67,29 +67,29 @@ define @test_signed_v8f32_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-822083584 // =0xcf000000 -; CHECK-NEXT: mov z3.s, #0x80000000 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: mov z6.s, #0x7fffffff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #1325400063 // =0x4effffff -; CHECK-NEXT: mov z6.s, #0x7fffffff -; CHECK-NEXT: mov z4.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z0.s +; CHECK-NEXT: mov z3.s, w8 +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzs z5.s, p0/m, z1.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z4.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z4.s +; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, z2.s +; CHECK-NEXT: fcmge p2.s, p0/z, z1.s, z2.s +; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: fcmgt p3.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z1.s, z3.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, z3.s +; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s ; CHECK-NEXT: fcmuo p1.s, p0/z, z0.s, z0.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z1.s, z1.s -; CHECK-NEXT: sel z3.s, p2, z3.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z2.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z3.s +; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -103,8 +103,8 @@ define @test_signed_v8f32_v8i32( %f) { define @test_signed_v4f32_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 @@ -132,28 +132,28 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #-956301312 // =0xc7000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z5.s, #32767 // =0x7fff ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: mov w8, #65024 // =0xfe00 ; CHECK-NEXT: movk w8, #18175, lsl #16 -; CHECK-NEXT: mov z3.s, w8 -; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s -; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z1.s +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzs z3.s, p0/m, z1.s ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.s, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s +; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s +; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z2.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z2.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, #-32768 // 
=0xffffffffffff8000 +; CHECK-NEXT: mov z3.s, p1/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s ; CHECK-NEXT: mov z4.s, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.s, p3, z5.s, z2.s +; CHECK-NEXT: sel z0.s, p3, z5.s, z3.s ; CHECK-NEXT: sel z1.s, p4, z5.s, z4.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 @@ -169,8 +169,8 @@ define @test_signed_v8f32_v8i16( %f) { define @test_signed_v2f32_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff @@ -198,31 +198,31 @@ define @test_signed_v4f32_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #-553648128 // =0xdf000000 ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.s, w8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1593835519 // =0x5effffff -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: mov z4.s, w8 +; CHECK-NEXT: mov z3.s, w8 ; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, z2.s ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, z2.s -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.s +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.s ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.s -; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z4.s -; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z4.s +; CHECK-NEXT: fcmgt p3.s, p0/z, z1.s, z3.s +; CHECK-NEXT: fcmgt p4.s, p0/z, z0.s, z3.s ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s ; CHECK-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -246,8 +246,8 @@ declare @llvm.fptosi.sat.nxv4f64.nxv4i64( @test_signed_v2f64_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 @@ -276,30 +276,30 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 -; CHECK-NEXT: mov z3.d, #0xffffffff80000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z6.d, #0x7fffffff ; CHECK-NEXT: mov z2.d, x8 ; 
CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 -; CHECK-NEXT: mov z6.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.d +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, #0xffffffff80000000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -322,48 +322,48 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4476578029606273024 // =0xc1e0000000000000 -; CHECK-NEXT: mov z26.d, #0x7fffffff +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z5.d, #0xffffffff80000000 ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281474972516352 // =0xffffffc00000 +; CHECK-NEXT: mov z26.d, #0x7fffffff ; CHECK-NEXT: movk x8, #16863, lsl #48 -; CHECK-NEXT: mov z5.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d -; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z4.d, #0xffffffff80000000 -; CHECK-NEXT: movprfx z6, z1 -; CHECK-NEXT: fcvtzs z6.d, p0/m, z1.d ; CHECK-NEXT: movprfx z7, z0 ; CHECK-NEXT: fcvtzs z7.d, p0/m, z0.d ; CHECK-NEXT: movprfx z24, z3 ; CHECK-NEXT: fcvtzs z24.d, p0/m, z3.d +; CHECK-NEXT: mov z6.d, x8 ; CHECK-NEXT: movprfx z25, z2 ; CHECK-NEXT: fcvtzs z25.d, p0/m, z2.d -; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z5.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z5.d -; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z5.d +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z4.d +; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, z4.d +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z6.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z6.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z3.d, z6.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z4.d, p1/m, z5.d +; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z6.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z6.d, p1/m, z4.d -; CHECK-NEXT: fcmgt p1.d, p0/z, z2.d, z5.d -; CHECK-NEXT: sel z5.d, p2, z4.d, z7.d +; CHECK-NEXT: sel z6.d, p2, z5.d, z7.d ; CHECK-NEXT: fcmuo p2.d, p0/z, z1.d, z1.d -; 
CHECK-NEXT: sel z7.d, p3, z4.d, z24.d +; CHECK-NEXT: sel z7.d, p3, z5.d, z24.d ; CHECK-NEXT: fcmuo p3.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z4.d, p4, z4.d, z25.d +; CHECK-NEXT: sel z5.d, p4, z5.d, z25.d ; CHECK-NEXT: fcmuo p4.d, p0/z, z3.d, z3.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z2.d, z2.d -; CHECK-NEXT: sel z0.d, p5, z26.d, z6.d -; CHECK-NEXT: sel z1.d, p6, z26.d, z5.d +; CHECK-NEXT: sel z0.d, p5, z26.d, z4.d +; CHECK-NEXT: sel z1.d, p6, z26.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: sel z2.d, p7, z26.d, z7.d ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: sel z3.d, p1, z26.d, z4.d +; CHECK-NEXT: sel z3.d, p1, z26.d, z5.d ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p3/m, #0 // =0x0 @@ -387,28 +387,28 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z5.d, #32767 // =0x7fff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: mov z3.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.d +; CHECK-NEXT: movprfx z3, z1 +; CHECK-NEXT: fcvtzs z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 ; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z3.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, z2.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: fcmgt p3.d, p0/z, z1.d, z2.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z0.d, z2.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: mov z3.d, p1/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p1.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d ; CHECK-NEXT: mov z4.d, p2/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: sel z0.d, p3, z5.d, z2.d +; CHECK-NEXT: sel z0.d, p3, z5.d, z3.d ; CHECK-NEXT: sel z1.d, p4, z5.d, z4.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 @@ -432,34 +432,34 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4548635623644200960 // =0xc0e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z25.d, #32767 // =0x7fff ; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: mov x8, #281200098803712 // =0xffc000000000 ; CHECK-NEXT: movk x8, #16607, lsl #48 -; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d -; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d -; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d -; CHECK-NEXT: movprfx z5, z3 -; CHECK-NEXT: fcvtzs z5.d, p0/m, z3.d -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: fcvtzs z6.d, p0/m, z2.d ; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: 
fcvtzs z7.d, p0/m, z1.d +; CHECK-NEXT: mov z5.d, x8 ; CHECK-NEXT: movprfx z24, z0 ; CHECK-NEXT: fcvtzs z24.d, p0/m, z0.d +; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, z4.d +; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, z4.d +; CHECK-NEXT: movprfx z4, z3 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z3.d +; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z5.d +; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z5.d +; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z5.d ; CHECK-NEXT: not p1.b, p0/z, p1.b -; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d -; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: fcmgt p7.d, p0/z, z1.d, z4.d ; CHECK-NEXT: not p3.b, p0/z, p3.b +; CHECK-NEXT: mov z4.d, p1/m, #-32768 // =0xffffffffffff8000 +; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z5.d ; CHECK-NEXT: not p4.b, p0/z, p4.b -; CHECK-NEXT: mov z5.d, p1/m, #-32768 // =0xffffffffffff8000 -; CHECK-NEXT: fcmgt p1.d, p0/z, z0.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p2.d, p0/z, z3.d, z3.d ; CHECK-NEXT: mov z7.d, p3/m, #-32768 // =0xffffffffffff8000 @@ -467,7 +467,7 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: mov z24.d, p4/m, #-32768 // =0xffffffffffff8000 ; CHECK-NEXT: fcmuo p4.d, p0/z, z1.d, z1.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z0.d, z0.d -; CHECK-NEXT: sel z2.d, p5, z25.d, z5.d +; CHECK-NEXT: sel z2.d, p5, z25.d, z4.d ; CHECK-NEXT: sel z0.d, p6, z25.d, z6.d ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: sel z1.d, p7, z25.d, z7.d @@ -492,8 +492,8 @@ define @test_signed_v8f64_v8i16( %f) { define @test_signed_v2f64_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f64_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff @@ -521,29 +521,29 @@ define @test_signed_v4f64_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #-4332462841530417152 // =0xc3e0000000000000 -; CHECK-NEXT: mov z3.d, #0x8000000000000000 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov x8, #4890909195324358655 // =0x43dfffffffffffff -; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff -; CHECK-NEXT: mov z4.d, x8 -; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d -; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d -; CHECK-NEXT: movprfx z2, z0 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: movprfx z4, z0 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z0.d ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z1.d -; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z4.d -; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z4.d +; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, z2.d +; CHECK-NEXT: fcmge p2.d, p0/z, z1.d, z2.d +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: fcmgt p3.d, p0/z, z0.d, z3.d +; CHECK-NEXT: fcmgt p4.d, p0/z, z1.d, z3.d ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.d, p0/z, z0.d, z0.d ; CHECK-NEXT: fcmuo p0.d, p0/z, z1.d, z1.d -; CHECK-NEXT: sel z3.d, p2, z3.d, 
z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 @@ -568,8 +568,8 @@ declare @llvm.fptosi.sat.nxv4f16.nxv4i64() define @test_signed_v2f16_v2i32( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -592,8 +592,8 @@ define @test_signed_v2f16_v2i32( %f) { define @test_signed_v4f16_v4i32( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x80000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -621,31 +621,31 @@ define @test_signed_v8f16_v8i32( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z0.s, z0.h ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.s, #0x80000000 -; CHECK-NEXT: mov z4.h, w8 +; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: mov z6.s, #0x7fffffff ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.s, p0/m, z1.h +; CHECK-NEXT: mov z2.s, #0x80000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.s, p0/m, z1.h ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.s, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z4.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.s, p1/m, z3.s +; CHECK-NEXT: sel z3.s, p1, z2.s, z4.s ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z3.s, p2, z3.s, z5.s -; CHECK-NEXT: sel z0.s, p3, z6.s, z2.s -; CHECK-NEXT: sel z1.s, p4, z6.s, z3.s +; CHECK-NEXT: sel z2.s, p2, z2.s, z5.s +; CHECK-NEXT: sel z0.s, p3, z6.s, z3.s +; CHECK-NEXT: sel z1.s, p4, z6.s, z2.s ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.s, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.s, p0/m, #0 // =0x0 @@ -659,8 +659,8 @@ define @test_signed_v8f16_v8i32( %f) { define @test_signed_v4f16_v4i16( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff ; CHECK-NEXT: mov z2.h, w8 @@ -682,8 +682,8 @@ define @test_signed_v4f16_v4i16( %f) { define @test_signed_v8f16_v8i16( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #63488 // =0xf800 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #30719 // =0x77ff ; CHECK-NEXT: mov z2.h, w8 @@ -705,8 +705,8 @@ define 
@test_signed_v8f16_v8i16( %f) { define @test_signed_v2f16_v2i64( %f) { ; CHECK-LABEL: test_signed_v2f16_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0x8000000000000000 ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #31743 // =0x7bff @@ -734,31 +734,31 @@ define @test_signed_v4f16_v4i64( %f) { ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #64511 // =0xfbff ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: mov z2.h, w8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #31743 // =0x7bff -; CHECK-NEXT: mov z3.d, #0x8000000000000000 -; CHECK-NEXT: mov z4.h, w8 +; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: mov z6.d, #0x7fffffffffffffff ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fcmge p2.h, p0/z, z0.h, z2.h -; CHECK-NEXT: movprfx z2, z1 -; CHECK-NEXT: fcvtzs z2.d, p0/m, z1.h +; CHECK-NEXT: mov z2.d, #0x8000000000000000 +; CHECK-NEXT: movprfx z4, z1 +; CHECK-NEXT: fcvtzs z4.d, p0/m, z1.h ; CHECK-NEXT: movprfx z5, z0 ; CHECK-NEXT: fcvtzs z5.d, p0/m, z0.h -; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z4.h -; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z4.h +; CHECK-NEXT: fcmgt p3.h, p0/z, z1.h, z3.h +; CHECK-NEXT: fcmgt p4.h, p0/z, z0.h, z3.h ; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b -; CHECK-NEXT: mov z2.d, p1/m, z3.d +; CHECK-NEXT: sel z3.d, p1, z2.d, z4.d ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z1.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z0.h, z0.h -; CHECK-NEXT: sel z3.d, p2, z3.d, z5.d -; CHECK-NEXT: sel z0.d, p3, z6.d, z2.d -; CHECK-NEXT: sel z1.d, p4, z6.d, z3.d +; CHECK-NEXT: sel z2.d, p2, z2.d, z5.d +; CHECK-NEXT: sel z0.d, p3, z6.d, z3.d +; CHECK-NEXT: sel z1.d, p4, z6.d, z2.d ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: mov z0.d, p1/m, #0 // =0x0 ; CHECK-NEXT: mov z1.d, p0/m, #0 // =0x0 diff --git a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll index c56c0b37888dc..ed352ffec339f 100644 --- a/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll +++ b/llvm/test/CodeGen/AArch64/sve-fptoui-sat.ll @@ -82,8 +82,8 @@ define @test_signed_v4f32_v4i16( %f) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: fcvtzu z2.s, p0/m, z0.s ; CHECK-NEXT: not p1.b, p0/z, p1.b @@ -102,9 +102,9 @@ define @test_signed_v8f32_v8i16( %f) { ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #65280 // =0xff00 ; CHECK-NEXT: movk w8, #18303, lsl #16 -; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z1.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.s, p0/m, z1.s ; CHECK-NEXT: movprfx z4, z0 @@ -146,10 +146,10 @@ define @test_signed_v2f32_v2i64( %f) { define @test_signed_v4f32_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #1602224127 // =0x5f7fffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.s, w8 ; CHECK-NEXT: fcmge p1.s, p0/z, z2.s, #0.0 ; CHECK-NEXT: fcmge p2.s, p0/z, 
z3.s, #0.0 @@ -186,8 +186,8 @@ define @test_signed_v2f64_v2i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: movprfx z2, z0 ; CHECK-NEXT: fcvtzu z2.d, p0/m, z0.d ; CHECK-NEXT: not p1.b, p0/z, p1.b @@ -206,9 +206,9 @@ define @test_signed_v4f64_v4i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 @@ -241,28 +241,28 @@ define @test_signed_v8f64_v8i32( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281474974613504 // =0xffffffe00000 ; CHECK-NEXT: movk x8, #16879, lsl #48 -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p4.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: movprfx z5, z1 ; CHECK-NEXT: fcvtzu z5.d, p0/m, z1.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z0 ; CHECK-NEXT: fcvtzu z6.d, p0/m, z0.d ; CHECK-NEXT: movprfx z7, z3 ; CHECK-NEXT: fcvtzu z7.d, p0/m, z3.d ; CHECK-NEXT: movprfx z24, z2 ; CHECK-NEXT: fcvtzu z24.d, p0/m, z2.d +; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p5.d, p0/z, z1.d, z4.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z0.d, z4.d -; CHECK-NEXT: mov z0.d, #0xffffffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z0.d, #0xffffffff ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: mov z5.d, p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p1.d, p0/z, z3.d, z4.d +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z2.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 @@ -289,9 +289,9 @@ define @test_signed_v4f64_v4i16( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z0.d, #0.0 +; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: movprfx z3, z1 ; CHECK-NEXT: fcvtzu z3.d, p0/m, z1.d ; CHECK-NEXT: movprfx z4, z0 @@ -324,28 +324,28 @@ define @test_signed_v8f64_v8i16( %f) { ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, #281337537757184 // =0xffe000000000 ; CHECK-NEXT: movk x8, #16623, lsl #48 -; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: fcmge p1.d, p0/z, z3.d, #0.0 ; CHECK-NEXT: fcmge p2.d, p0/z, z2.d, #0.0 ; CHECK-NEXT: fcmge p3.d, p0/z, z1.d, #0.0 ; CHECK-NEXT: fcmge p4.d, p0/z, z0.d, #0.0 ; CHECK-NEXT: movprfx z5, z3 ; CHECK-NEXT: fcvtzu z5.d, p0/m, z3.d +; CHECK-NEXT: mov z4.d, x8 ; CHECK-NEXT: movprfx z6, z2 ; CHECK-NEXT: fcvtzu z6.d, p0/m, z2.d ; CHECK-NEXT: movprfx z7, z1 ; CHECK-NEXT: fcvtzu z7.d, p0/m, z1.d ; CHECK-NEXT: movprfx z24, z0 ; CHECK-NEXT: fcvtzu z24.d, p0/m, z0.d +; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: fcmgt p5.d, p0/z, z3.d, z4.d ; CHECK-NEXT: fcmgt p6.d, p0/z, z2.d, z4.d -; CHECK-NEXT: mov z2.d, #65535 // =0xffff -; CHECK-NEXT: not p1.b, p0/z, p1.b ; CHECK-NEXT: not p2.b, p0/z, p2.b +; CHECK-NEXT: mov z2.d, #65535 // =0xffff ; CHECK-NEXT: not p3.b, p0/z, p3.b -; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: mov z5.d, 
p1/m, #0 // =0x0 ; CHECK-NEXT: fcmgt p1.d, p0/z, z1.d, z4.d +; CHECK-NEXT: not p4.b, p0/z, p4.b ; CHECK-NEXT: fcmgt p0.d, p0/z, z0.d, z4.d ; CHECK-NEXT: mov z6.d, p2/m, #0 // =0x0 ; CHECK-NEXT: mov z7.d, p3/m, #0 // =0x0 @@ -465,10 +465,10 @@ define @test_signed_v4f16_v4i32( %f) { define @test_signed_v8f16_v8i32( %f) { ; CHECK-LABEL: test_signed_v8f16_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 @@ -549,10 +549,10 @@ define @test_signed_v2f16_v2i64( %f) { define @test_signed_v4f16_v4i64( %f) { ; CHECK-LABEL: test_signed_v4f16_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, z0.s ; CHECK-NEXT: mov w8, #31743 // =0x7bff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z4.h, w8 ; CHECK-NEXT: fcmge p1.h, p0/z, z2.h, #0.0 ; CHECK-NEXT: fcmge p2.h, p0/z, z3.h, #0.0 diff --git a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll index 5c4c9463528b8..ad6371f78ec08 100644 --- a/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll +++ b/llvm/test/CodeGen/AArch64/sve-gather-scatter-addr-opts.ll @@ -71,12 +71,12 @@ define @gather_i8_index_offset_8(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_f16_index_offset_var: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x1 -; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: uunpklo z3.d, z0.s ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: movprfx z4, z2 ; CHECK-NEXT: mla z4.d, p1/m, z1.d, z2.d @@ -101,16 +101,16 @@ define void @scatter_f16_index_offset_var(ptr %base, i64 %offset, i64 %scale, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_maximum_plus_one: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov w8, #33554432 // =0x2000000 ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: mov w10, #67108864 // =0x4000000 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] @@ -131,17 +131,17 @@ define void @scatter_i8_index_offset_maximum_plus_one(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_offset_minimum_minus_one: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov x8, #-33554433 // =0xfffffffffdffffff ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: mov x10, #-2 // =0xfffffffffffffffe +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: mov x10, #-2 // =0xfffffffffffffffe ; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: movk x10, #64511, lsl #16 ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: uunpkhi z0.d, z0.s ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] @@ 
-162,16 +162,16 @@ define void @scatter_i8_index_offset_minimum_minus_one(ptr %base, i64 %offset, < define void @scatter_i8_index_stride_too_big(ptr %base, i64 %offset, %pg, %data) #0 { ; CHECK-LABEL: scatter_i8_index_stride_too_big: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov x8, #4611686018427387904 // =0x4000000000000000 ; CHECK-NEXT: uunpklo z2.d, z0.s -; CHECK-NEXT: index z1.d, #0, x8 ; CHECK-NEXT: rdvl x9, #1 -; CHECK-NEXT: add x8, x0, x1 +; CHECK-NEXT: index z1.d, #0, x8 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: lsr x9, x9, #4 +; CHECK-NEXT: add x8, x0, x1 ; CHECK-NEXT: mov x10, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z2.d }, p1, [x8, z1.d] ; CHECK-NEXT: madd x8, x9, x10, x8 ; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] diff --git a/llvm/test/CodeGen/AArch64/sve-hadd.ll b/llvm/test/CodeGen/AArch64/sve-hadd.ll index c73370d50287b..f90aef8daa5dc 100644 --- a/llvm/test/CodeGen/AArch64/sve-hadd.ll +++ b/llvm/test/CodeGen/AArch64/sve-hadd.ll @@ -129,9 +129,9 @@ define @haddu_v2i32( %s0, @haddu_v2i16( %s0, @haddu_v4i16( %s0, @haddu_v4i8( %s0, %s ; ; SVE2-LABEL: haddu_v4i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: and z0.s, z0.s, #0xff ; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: uhadd z0.s, p0/m, z0.s, z1.s ; SVE2-NEXT: ret entry: @@ -557,9 +557,9 @@ define @haddu_v8i8( %s0, %s ; ; SVE2-LABEL: haddu_v8i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: and z0.h, z0.h, #0xff ; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: uhadd z0.h, p0/m, z0.h, z1.h ; SVE2-NEXT: ret entry: @@ -787,9 +787,9 @@ define @rhaddu_v2i32( %s0, @rhaddu_v2i16( %s0, @rhaddu_v4i16( %s0, @rhaddu_v4i8( %s0, % ; ; SVE2-LABEL: rhaddu_v4i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: and z0.s, z0.s, #0xff ; SVE2-NEXT: and z1.s, z1.s, #0xff +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: urhadd z0.s, p0/m, z0.s, z1.s ; SVE2-NEXT: ret entry: @@ -1241,9 +1241,9 @@ define @rhaddu_v8i8( %s0, % ; ; SVE2-LABEL: rhaddu_v8i8: ; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: and z0.h, z0.h, #0xff ; SVE2-NEXT: and z1.h, z1.h, #0xff +; SVE2-NEXT: ptrue p0.h ; SVE2-NEXT: urhadd z0.h, p0/m, z0.h, z1.h ; SVE2-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll index e20399de70bf8..73bbee094827e 100644 --- a/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll +++ b/llvm/test/CodeGen/AArch64/sve-implicit-zero-filling.ll @@ -175,14 +175,14 @@ define @uminv_zero_fill( %pg, @zero_fill_non_zero_index( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_non_zero_index: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: uminv d3, p0, z0.d ; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, #0 // =0x0 -; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: cmpeq p0.d, p1/z, z1.d, z2.d +; CHECK-NEXT: fmov x8, d3 ; CHECK-NEXT: mov z0.d, p0/m, x8 ; CHECK-NEXT: ret %t1 = call i64 @llvm.aarch64.sve.uminv.nxv2i64( %pg, %a) @@ -210,11 +210,11 @@ define @zero_fill_type_mismatch( %pg, @zero_fill_no_zero_upper_lanes( %pg, %a) #0 { ; CHECK-LABEL: zero_fill_no_zero_upper_lanes: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d, vl1 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z0.d ; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: 
ptrue p0.d, vl1 ; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: mov z1.d, p1/m, x8 +; CHECK-NEXT: mov z1.d, p0/m, x8 ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret %t1 = call @llvm.aarch64.sve.umin.nxv2i64( %pg, %a, %a) diff --git a/llvm/test/CodeGen/AArch64/sve-insert-element.ll b/llvm/test/CodeGen/AArch64/sve-insert-element.ll index 2aa298f6d9173..7344964f13bba 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-element.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-element.ll @@ -48,8 +48,8 @@ define @test_lane0_2xi64( %a) { define @test_lane0_2xf64( %a) { ; CHECK-LABEL: test_lane0_2xf64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: fmov d1, #1.00000000 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: mov z0.d, p0/m, z1.d ; CHECK-NEXT: ret %b = insertelement %a, double 1.0, i32 0 @@ -59,8 +59,8 @@ define @test_lane0_2xf64( %a) { define @test_lane0_4xf32( %a) { ; CHECK-LABEL: test_lane0_4xf32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: fmov s1, #1.00000000 +; CHECK-NEXT: ptrue p0.s, vl1 ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret %b = insertelement %a, float 1.0, i32 0 @@ -70,8 +70,8 @@ define @test_lane0_4xf32( %a) { define @test_lane0_8xf16( %a) { ; CHECK-LABEL: test_lane0_8xf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: fmov h1, #1.00000000 +; CHECK-NEXT: ptrue p0.h, vl1 ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret %b = insertelement %a, half 1.0, i32 0 @@ -93,9 +93,9 @@ define @test_lane0_8xbf16( %a, bfloat define @test_lane4_2xi64( %a) { ; CHECK-LABEL: test_lane4_2xi64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #4 // =0x4 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d @@ -109,9 +109,9 @@ define @test_lane4_2xi64( %a) { define @test_lane9_8xf16( %a) { ; CHECK-LABEL: test_lane9_8xf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #9 // =0x9 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: fmov h1, #1.00000000 @@ -124,9 +124,9 @@ define @test_lane9_8xf16( %a) { define @test_lane9_8xbf16( %a, bfloat %x) { ; CHECK-LABEL: test_lane9_8xbf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #9 // =0x9 ; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h ; CHECK-NEXT: mov z0.h, p0/m, h1 @@ -138,9 +138,9 @@ define @test_lane9_8xbf16( %a, bfloat define @test_lane1_16xi8( %a) { ; CHECK-LABEL: test_lane1_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b @@ -153,9 +153,9 @@ define @test_lane1_16xi8( %a) { define @test_lanex_16xi8( %a, i32 %x) { ; CHECK-LABEL: test_lanex_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: mov w8, #30 // =0x1e ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b @@ -179,9 +179,9 @@ define @extract_insert_4xi32( %a) { define @test_lane6_undef_8xi16(i16 %a) { ; CHECK-LABEL: test_lane6_undef_8xi16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #6 // =0x6 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: 
ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h ; CHECK-NEXT: mov z0.h, p0/m, w0 @@ -202,8 +202,8 @@ define @test_lane0_undef_16xi8(i8 %a) { define @test_insert0_of_extract0_16xi8( %a, %b) { ; CHECK-LABEL: test_insert0_of_extract0_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl1 ; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: ptrue p0.b, vl1 ; CHECK-NEXT: mov z0.b, p0/m, w8 ; CHECK-NEXT: ret %c = extractelement %b, i32 0 @@ -215,12 +215,12 @@ define @test_insert64_of_extract64_16xi8( % ; CHECK-LABEL: test_insert64_of_extract64_16xi8: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #64 // =0x40 -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: whilels p0.b, xzr, x8 ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: lastb w9, p0, z1.b ; CHECK-NEXT: index z1.b, #0, #1 -; CHECK-NEXT: cmpeq p0.b, p1/z, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, w9 ; CHECK-NEXT: ret %c = extractelement %b, i32 64 @@ -231,9 +231,9 @@ define @test_insert64_of_extract64_16xi8( % define @test_insert3_of_extract1_16xi8( %a, %b) { ; CHECK-LABEL: test_insert3_of_extract1_16xi8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: umov w8, v1.b[1] ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -329,9 +329,9 @@ define @test_insert_into_undef_nxv2f64(double %a) { define @test_insert_with_index_nxv2f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -342,9 +342,9 @@ define @test_insert_with_index_nxv2f16(half %h, i64 %idx) { define @test_insert_with_index_nxv4f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: mov z2.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -355,9 +355,9 @@ define @test_insert_with_index_nxv4f16(half %h, i64 %idx) { define @test_insert_with_index_nxv8f16(half %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -368,9 +368,9 @@ define @test_insert_with_index_nxv8f16(half %h, i64 %idx) { define @test_insert_with_index_nxv2bf16(bfloat %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -381,9 +381,9 @@ define @test_insert_with_index_nxv2bf16(bfloat %h, i64 %id define @test_insert_with_index_nxv4bf16(bfloat %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: mov z2.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -394,9 +394,9 @@ define @test_insert_with_index_nxv4bf16(bfloat %h, i64 %id define 
@test_insert_with_index_nxv8bf16(bfloat %h, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, h0 ; CHECK-NEXT: ret @@ -407,9 +407,9 @@ define @test_insert_with_index_nxv8bf16(bfloat %h, i64 %id define @test_insert_with_index_nxv2f32(float %f, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.s, p0/m, s0 ; CHECK-NEXT: ret @@ -420,9 +420,9 @@ define @test_insert_with_index_nxv2f32(float %f, i64 %idx) define @test_insert_with_index_nxv4f32(float %f, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z1.s, #0, #1 ; CHECK-NEXT: mov z2.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, s0 ; CHECK-NEXT: ret @@ -433,9 +433,9 @@ define @test_insert_with_index_nxv4f32(float %f, i64 %idx) define @test_insert_with_index_nxv2f64(double %d, i64 %idx) { ; CHECK-LABEL: test_insert_with_index_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: index z1.d, #0, #1 ; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d ; CHECK-NEXT: mov z0.d, p0/m, d0 ; CHECK-NEXT: ret @@ -447,11 +447,11 @@ define @test_insert_with_index_nxv2f64(double %d, i64 %idx define @test_predicate_insert_2xi1_immediate ( %val, i1 %elt) { ; CHECK-LABEL: test_predicate_insert_2xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d, vl1 ; CHECK-NEXT: mov z0.d, p0/z, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d, vl1 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-NEXT: mov z0.d, p0/m, x0 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov z0.d, p1/m, x0 ; CHECK-NEXT: and z0.d, z0.d, #0x1 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ret @@ -462,9 +462,9 @@ define @test_predicate_insert_2xi1_immediate ( @test_predicate_insert_4xi1_immediate ( %val, i1 %elt) { ; CHECK-LABEL: test_predicate_insert_4xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov w8, #2 // =0x2 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 @@ -479,9 +479,9 @@ define @test_predicate_insert_4xi1_immediate ( @test_predicate_insert_8xi1_immediate ( %val, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_8xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov w8, w0 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h @@ -497,9 +497,9 @@ define @test_predicate_insert_8xi1_immediate ( @test_predicate_insert_16xi1_immediate ( %val) { ; CHECK-LABEL: test_predicate_insert_16xi1_immediate: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: mov w8, #4 // =0x4 ; CHECK-NEXT: index z0.b, #0, #1 +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b @@ -516,9 +516,9 @@ define @test_predicate_insert_16xi1_immediate ( @test_predicate_insert_2xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_2xi1: 
; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: cmpeq p2.d, p1/z, z0.d, z1.d @@ -534,9 +534,9 @@ define @test_predicate_insert_2xi1( %val, i1 define @test_predicate_insert_4xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_4xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: cmpeq p2.s, p1/z, z0.s, z1.s ; CHECK-NEXT: mov z0.s, p0/z, #1 // =0x1 @@ -550,9 +550,9 @@ define @test_predicate_insert_4xi1( %val, i1 define @test_predicate_insert_8xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_8xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: cmpeq p2.h, p1/z, z0.h, z1.h ; CHECK-NEXT: mov z0.h, p0/z, #1 // =0x1 @@ -567,9 +567,9 @@ define @test_predicate_insert_8xi1( %val, i1 define @test_predicate_insert_16xi1( %val, i1 %elt, i32 %idx) { ; CHECK-LABEL: test_predicate_insert_16xi1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov w8, w1 ; CHECK-NEXT: index z0.b, #0, #1 +; CHECK-NEXT: mov w8, w1 +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: mov z1.b, w8 ; CHECK-NEXT: cmpeq p2.b, p1/z, z0.b, z1.b ; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1 @@ -589,24 +589,24 @@ define @test_predicate_insert_32xi1( %val, ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: ptrue p2.b ; CHECK-NEXT: rdvl x8, #2 ; CHECK-NEXT: mov z0.b, p1/z, #1 // =0x1 ; CHECK-NEXT: mov z1.b, p0/z, #1 // =0x1 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: mov w9, w1 +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmp x9, x8 ; CHECK-NEXT: csel x8, x9, x8, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1b { z0.b }, p2, [sp, #1, mul vl] -; CHECK-NEXT: st1b { z1.b }, p2, [sp] +; CHECK-NEXT: st1b { z0.b }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [sp] ; CHECK-NEXT: strb w0, [x9, x8] -; CHECK-NEXT: ld1b { z0.b }, p2/z, [sp] -; CHECK-NEXT: ld1b { z1.b }, p2/z, [sp, #1, mul vl] +; CHECK-NEXT: ld1b { z0.b }, p1/z, [sp] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [sp, #1, mul vl] ; CHECK-NEXT: and z0.b, z0.b, #0x1 ; CHECK-NEXT: and z1.b, z1.b, #0x1 -; CHECK-NEXT: cmpne p0.b, p2/z, z0.b, #0 -; CHECK-NEXT: cmpne p1.b, p2/z, z1.b, #0 +; CHECK-NEXT: cmpne p0.b, p1/z, z0.b, #0 +; CHECK-NEXT: cmpne p1.b, p1/z, z1.b, #0 ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll index 4a5e272582d8e..5efe9e2819d5e 100644 --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -17,15 +17,15 @@ define @insert_v2i64_nxv2i64_idx2( %vec, <2 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntd x8 ; CHECK-NEXT: mov w9, #2 // =0x2 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub x8, x8, #2 ; CHECK-NEXT: cmp x8, #2 +; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #3 -; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -51,15 +51,15 @@ define @insert_v4i32_nxv4i32_idx4( %vec, <4 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cntw x8 ; CHECK-NEXT: mov w9, #4 // =0x4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sub x8, x8, #4 ; CHECK-NEXT: cmp x8, #4 +; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #2 -; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -85,15 +85,15 @@ define @insert_v8i16_nxv8i16_idx8( %vec, <8 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #8 // =0x8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: sub x8, x8, #8 ; CHECK-NEXT: cmp x8, #8 +; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: lsl x8, x8, #1 -; CHECK-NEXT: st1h { z0.h }, p0, [sp] ; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1h { z0.h }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 @@ -119,15 +119,15 @@ define @insert_v16i8_nxv16i8_idx16( %vec, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, #16 // =0x10 ; CHECK-NEXT: sub x8, x8, #16 -; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: cmp x8, #16 -; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: st1b { z0.b }, p0, [sp] -; CHECK-NEXT: str q1, [x10, x8] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: str q1, [x9, x8] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -239,8 +239,8 @@ define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) uwtable { ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: str q0, [sp, #16] ; CHECK-NEXT: ld1d { z0.d }, p0/z, [sp, #1, mul vl] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll index c0ddceb42e1d0..52bd79e7a7e60 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith-imm.ll @@ -55,8 +55,8 @@ define @smax_i16_neg( %a) { define @smax_i16_out_of_range( %a) { ; CHECK-LABEL: smax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -93,8 +93,8 @@ define @smax_i32_neg( %a) { define @smax_i32_out_of_range( %a) { ; CHECK-LABEL: smax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 -129, i32 0 @@ -131,8 +131,8 @@ define @smax_i64_neg( %a) { define @smax_i64_out_of_range( %a) { ; CHECK-LABEL: smax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -196,8 +196,8 @@ define @smin_i16_neg( %a) { define @smin_i16_out_of_range( %a) { ; CHECK-LABEL: smin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -234,8 +234,8 @@ define @smin_i32_neg( %a) { define @smin_i32_out_of_range( %a) { ; CHECK-LABEL: smin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 -129, i32 0 @@ -272,8 +272,8 @@ define @smin_i64_neg( %a) { define @smin_i64_out_of_range( %a) { ; CHECK-LABEL: smin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -325,8 +325,8 @@ define @umax_i16_pos( %a) { define @umax_i16_out_of_range( %a) { ; CHECK-LABEL: umax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, 
#0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -351,8 +351,8 @@ define @umax_i32_pos( %a) { define @umax_i32_out_of_range( %a) { ; CHECK-LABEL: umax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -378,8 +378,8 @@ define @umax_i64_pos( %a) { define @umax_i64_out_of_range( %a) { ; CHECK-LABEL: umax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -431,8 +431,8 @@ define @umin_i16_pos( %a) { define @umin_i16_out_of_range( %a) { ; CHECK-LABEL: umin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 257, i32 0 @@ -457,8 +457,8 @@ define @umin_i32_pos( %a) { define @umin_i32_out_of_range( %a) { ; CHECK-LABEL: umin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -484,8 +484,8 @@ define @umin_i64_pos( %a) { define @umin_i64_out_of_range( %a) { ; CHECK-LABEL: umin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 65535, i32 0 @@ -589,8 +589,8 @@ define @mul_i64_pos( %a) { define @mul_i16_range( %a) { ; CHECK-LABEL: mul_i16_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %elt = insertelement undef, i16 255, i32 0 @@ -602,8 +602,8 @@ define @mul_i16_range( %a) { define @mul_i32_range( %a) { ; CHECK-LABEL: mul_i32_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #255 // =0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %elt = insertelement undef, i32 255, i32 0 @@ -615,8 +615,8 @@ define @mul_i32_range( %a) { define @mul_i64_range( %a) { ; CHECK-LABEL: mul_i64_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #255 // =0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %elt = insertelement undef, i64 255, i32 0 @@ -766,8 +766,8 @@ define @lsr_i64( %a){ define @sdiv_const( %a) #0 { ; CHECK-LABEL: sdiv_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: @@ -778,8 +778,8 @@ entry: define @udiv_const( %a) #0 { ; CHECK-LABEL: udiv_const: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #3 // =0x3 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-int-arith.ll index fc2672f8c80a8..5f92dee3b5305 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-arith.ll @@ -532,8 +532,8 @@ define 
@mls_i64( %a, %b, define @muladd_i64_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i64_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret { @@ -545,8 +545,8 @@ define @mls_i64( %a, %b, define @muladd_i64_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i64_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, #0xffffffff00000001 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mad z0.d, p0/m, z1.d, z2.d ; CHECK-NEXT: ret { @@ -559,8 +559,8 @@ define @muladd_i64_negativeAddend( %a, @muladd_i32_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i32_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0x10000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret { @@ -572,8 +572,8 @@ define @muladd_i32_positiveAddend( %a, @muladd_i32_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i32_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #0xffff0000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mad z0.s, p0/m, z1.s, z2.s ; CHECK-NEXT: ret { @@ -585,8 +585,8 @@ define @muladd_i32_negativeAddend( %a, @muladd_i16_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i16_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #255 // =0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret { @@ -598,8 +598,8 @@ define @muladd_i16_positiveAddend( %a, @muladd_i16_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i16_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, #-255 // =0xffffffffffffff01 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mad z0.h, p0/m, z1.h, z2.h ; CHECK-NEXT: ret { @@ -611,8 +611,8 @@ define @muladd_i16_negativeAddend( %a, @muladd_i8_positiveAddend( %a, %b) ; CHECK-LABEL: muladd_i8_positiveAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, #15 // =0xf +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b ; CHECK-NEXT: ret { @@ -624,8 +624,8 @@ define @muladd_i8_positiveAddend( %a, @muladd_i8_negativeAddend( %a, %b) ; CHECK-LABEL: muladd_i8_negativeAddend: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, #-15 // =0xfffffffffffffff1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mad z0.b, p0/m, z1.b, z2.b ; CHECK-NEXT: ret { @@ -748,8 +748,8 @@ define @mulsub_i8_negativeAddend( %a, @multiple_fused_ops( %a, %b) ; CHECK-LABEL: multiple_fused_ops: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #200 // =0xc8 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: mla z2.h, p0/m, z0.h, z1.h ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h @@ -770,8 +770,8 @@ define void @mad_in_loop(ptr %dst, ptr %src1, ptr %src2, i32 %n) { ; CHECK-NEXT: b.lt .LBB70_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: whilelo p1.s, xzr, x9 ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: cntw x10 diff --git a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll index d04da62451778..8c1b5225b7f25 100644 --- a/llvm/test/CodeGen/AArch64/sve-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-int-reduce.ll @@ -378,29 +378,29 @@ declare i8 @llvm.vector.reduce.smin.nxv10i8() define i8 @smin_nxv10i8( %a) { ; CHECK-LABEL: smin_nxv10i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z2.h, 
z0.b -; CHECK-NEXT: mov z1.d, #127 // =0x7f +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: mov z3.d, #127 // =0x7f ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpklo z3.s, z2.h -; CHECK-NEXT: uunpkhi z2.s, z2.h -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uzp1 z3.s, z3.s, z1.s -; CHECK-NEXT: uzp1 z2.h, z3.h, z2.h -; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b -; CHECK-NEXT: uunpkhi z2.h, z2.b -; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpkhi z3.d, z3.s -; CHECK-NEXT: uzp1 z3.s, z1.s, z3.s -; CHECK-NEXT: uzp1 z2.h, z2.h, z3.h -; CHECK-NEXT: uzp1 z2.b, z0.b, z2.b -; CHECK-NEXT: uunpkhi z2.h, z2.b -; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s ; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z3.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -414,12 +414,12 @@ declare i8 @llvm.vector.reduce.add.nxv12i8() define i8 @uaddv_nxv12i8( %a) { ; CHECK-LABEL: uaddv_nxv12i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: mov z2.s, #0 // =0x0 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; CHECK-NEXT: fmov x0, d0 @@ -434,15 +434,15 @@ declare i8 @llvm.vector.reduce.umax.nxv14i8() define i8 @umax_nxv14i8( %a) { ; CHECK-LABEL: umax_nxv14i8: ; CHECK: // %bb.0: -; CHECK-NEXT: uunpkhi z2.h, z0.b -; CHECK-NEXT: mov z1.d, #0 // =0x0 +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: mov z3.d, #0 // =0x0 ; CHECK-NEXT: uunpklo z0.h, z0.b ; CHECK-NEXT: ptrue p0.b -; CHECK-NEXT: uunpkhi z3.s, z2.h -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z3.d, z3.s -; CHECK-NEXT: uzp1 z1.s, z3.s, z1.s -; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll index 2464eacd185dd..bc94c087ef5fe 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-index.ll @@ -239,10 +239,10 @@ define @index_rr_i32_combine(i32 %a, i32 %b) { define @index_rr_i32_not_combine(i32 %a, i32 %b) { ; CHECK-LABEL: index_rr_i32_not_combine: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z0.s, #0, #1 ; CHECK-NEXT: mov z1.s, w0 ; CHECK-NEXT: mov z2.s, w1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mla 
z1.s, p0/m, z0.s, z2.s ; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll index 3e453a6b78179..5648e8244e6ec 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-imm.ll @@ -247,8 +247,8 @@ define @sub_i32_ptrue_all_h( %a) #0 { define @sub_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: sub_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -402,8 +402,8 @@ define @subr_i32_ptrue_all_h( %a) #0 { define @subr_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: subr_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: subr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -449,8 +449,8 @@ define @smax_i16( %a) { define @smax_i16_out_of_range( %a) { ; CHECK-LABEL: smax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #129 // =0x81 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret @@ -480,8 +480,8 @@ define @smax_i32( %a) { define @smax_i32_out_of_range( %a) { ; CHECK-LABEL: smax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) @@ -510,8 +510,8 @@ define @smax_i64( %a) { define @smax_i64_out_of_range( %a) { ; CHECK-LABEL: smax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -559,8 +559,8 @@ define @smax_i32_ptrue_all_h( %a) #0 { define @smax_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: smax_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -606,8 +606,8 @@ define @smin_i16( %a) { define @smin_i16_out_of_range( %a) { ; CHECK-LABEL: smin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, #-129 // =0xffffffffffffff7f +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -636,8 +636,8 @@ define @smin_i32( %a) { define @smin_i32_out_of_range( %a) { ; CHECK-LABEL: smin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -668,8 +668,8 @@ define @smin_i64( %a) { define @smin_i64_out_of_range( %a) { ; CHECK-LABEL: smin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #-256 // =0xffffffffffffff00 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -717,8 +717,8 @@ define @smin_i32_ptrue_all_h( %a) #0 { define 
@smin_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: smin_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -764,8 +764,8 @@ define @umax_i16( %a) { define @umax_i16_out_of_range( %a) { ; CHECK-LABEL: umax_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -794,8 +794,8 @@ define @umax_i32( %a) { define @umax_i32_out_of_range( %a) { ; CHECK-LABEL: umax_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -825,8 +825,8 @@ define @umax_i64( %a) { define @umax_i64_out_of_range( %a) { ; CHECK-LABEL: umax_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -874,8 +874,8 @@ define @umax_i32_ptrue_all_h( %a) #0 { define @umax_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: umax_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -921,8 +921,8 @@ define @umin_i16( %a) { define @umin_i16_out_of_range( %a) { ; CHECK-LABEL: umin_i16_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: dupm z1.b, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) @@ -951,8 +951,8 @@ define @umin_i32( %a) { define @umin_i32_out_of_range( %a) { ; CHECK-LABEL: umin_i32_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #257 // =0x101 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret @@ -982,8 +982,8 @@ define @umin_i64( %a) { define @umin_i64_out_of_range( %a) { ; CHECK-LABEL: umin_i64_out_of_range: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -1031,8 +1031,8 @@ define @umin_i32_ptrue_all_h( %a) #0 { define @umin_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: umin_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) @@ -2120,8 +2120,8 @@ define @mul_i32_ptrue_all_h( %a) #0 { define @mul_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: mul_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll index 7cdedeee2cada..5db7ee75c2a8d 100644 --- 
a/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-logical-imm.ll @@ -261,8 +261,8 @@ define @orr_i32_ptrue_all_h( %a) { define @orr_i32_ptrue_all_d( %a) { ; CHECK-LABEL: orr_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #65535 // =0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: orr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll index da5dc5c5b34d9..619134dc4a696 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld-post-inc.ll @@ -38,18 +38,18 @@ define @test_post_ld1_dup(ptr %a, ptr %ptr, i64 %inc) { define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 { ; CHECK-LABEL: test_post_ld1_int_fixed: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #2 // =0x2 ; CHECK-NEXT: index z0.d, #0, #1 -; CHECK-NEXT: ptrue p1.d, vl1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: ldr x8, [x0] -; CHECK-NEXT: ldr x9, [x0, x1, lsl #3] +; CHECK-NEXT: ptrue p2.d, vl1 ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2] -; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ldr x9, [x0, x1, lsl #3] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: mov z0.d, z2.d -; CHECK-NEXT: mov z2.d, p2/m, x9 -; CHECK-NEXT: mov z0.d, p1/m, x8 +; CHECK-NEXT: mov z0.d, p2/m, x8 +; CHECK-NEXT: mov z2.d, p1/m, x9 ; CHECK-NEXT: add z0.d, z0.d, z2.d ; CHECK-NEXT: st1d { z0.d }, p0, [x3] ; CHECK-NEXT: ret @@ -67,18 +67,18 @@ define void @test_post_ld1_int_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_pt define void @test_post_ld1_double_fixed(ptr %data, i64 %idx, ptr %addr, ptr %res_ptr) #1 { ; CHECK-LABEL: test_post_ld1_double_fixed: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #2 // =0x2 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 -; CHECK-NEXT: ptrue p1.d, vl1 -; CHECK-NEXT: ld1d { z2.d }, p0/z, [x2] -; CHECK-NEXT: cmpeq p2.d, p0/z, z0.d, z1.d -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ldr d1, [x0, x1, lsl #3] -; CHECK-NEXT: sel z0.d, p1, z0.d, z2.d -; CHECK-NEXT: mov z2.d, p2/m, d1 -; CHECK-NEXT: fadd z0.d, z0.d, z2.d +; CHECK-NEXT: ptrue p2.d, vl1 +; CHECK-NEXT: ldr d2, [x0, x1, lsl #3] +; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x2] +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: sel z1.d, p2, z1.d, z0.d +; CHECK-NEXT: mov z0.d, p1/m, d2 +; CHECK-NEXT: fadd z0.d, z1.d, z0.d ; CHECK-NEXT: st1d { z0.d }, p0, [x3] ; CHECK-NEXT: ret %A = load <4 x double>, ptr %addr diff --git a/llvm/test/CodeGen/AArch64/sve-ld1r.ll b/llvm/test/CodeGen/AArch64/sve-ld1r.ll index e42e2272a2d4f..fbe82e8591fd0 100644 --- a/llvm/test/CodeGen/AArch64/sve-ld1r.ll +++ b/llvm/test/CodeGen/AArch64/sve-ld1r.ll @@ -20,8 +20,8 @@ define @ld1r_stack() { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: adrp x8, :got:g8 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: ldr x8, [x8, :got_lo12:g8] ; CHECK-NEXT: ldrb w8, [x8] ; CHECK-NEXT: strb w8, [sp, #12] @@ -1433,10 +1433,10 @@ define ptr @avoid_preindex_load(ptr %src, ptr %out) { define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) { ; CHECK-LABEL: avoid_preindex_load_dup: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NEXT: ptrue p0.d 
; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1450,10 +1450,10 @@ define ptr @avoid_preindex_load_dup(ptr %src, %pg, ptr %out) { define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %pg, ptr %out) { ; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1rsb { z0.d }, p0/z, [x0, #1] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add x0, x0, #1 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1467,10 +1467,10 @@ define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, %p define ptr @preindex_load_dup_passthru( %passthru, ptr %src, %pg, ptr %out) { ; CHECK-LABEL: preindex_load_dup_passthru: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ldrsb x8, [x0, #1]! ; CHECK-NEXT: mov z0.d, p0/m, x8 -; CHECK-NEXT: st1d { z0.d }, p1, [x1] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: ret %ptr = getelementptr inbounds i8, ptr %src, i64 1 %tmp = load i8, ptr %ptr, align 4 @@ -1485,8 +1485,8 @@ define ptr @preindex_load_dup_passthru( %passthru, ptr %src, < define ptr @preidx8sext64_instead_of_ld1r(ptr %src, ptr %out, ptr %dst) { ; CHECK-LABEL: preidx8sext64_instead_of_ld1r: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldrsb x8, [x0, #1]! +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: st1d { z0.d }, p0, [x1] ; CHECK-NEXT: str x8, [x2] diff --git a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll index 06ec132808154..5d53c00c52728 100644 --- a/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll +++ b/llvm/test/CodeGen/AArch64/sve-lsr-scaled-index-addressing-mode.ll @@ -38,8 +38,8 @@ define void @ld_st_nxv8i16(ptr %in, ptr %out) { ; ; ASM-LABEL: ld_st_nxv8i16: ; ASM: // %bb.0: // %entry -; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov z0.h, #3 // =0x3 +; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov x8, xzr ; ASM-NEXT: cnth x9 ; ASM-NEXT: .LBB0_1: // %loop @@ -111,8 +111,8 @@ define void @masked_ld_st_nxv8i16(ptr %in, ptr %out, i64 %n) { ; ; ASM-LABEL: masked_ld_st_nxv8i16: ; ASM: // %bb.0: // %entry -; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov z0.h, #3 // =0x3 +; ASM-NEXT: ptrue p0.h ; ASM-NEXT: mov x8, xzr ; ASM-NEXT: cnth x9 ; ASM-NEXT: .LBB1_1: // %loop diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll index e40d65efb158b..dfdfc456ccdba 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll @@ -126,9 +126,9 @@ define @masked_gather_nxv8f16( %ptrs, @masked_gather_nxv8bf16(ptr %base, %indices, %mask) #0 { ; CHECK-LABEL: masked_gather_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: sunpkhi z1.s, z0.h ; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1] ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] @@ -175,14 +175,14 @@ define @masked_gather_nxv8f32(ptr %base, define @masked_gather_nxv16i8(ptr %base, %indices, %mask) #0 { ; CHECK-LABEL: masked_gather_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: punpkhi p1.h, p0.b 
; CHECK-NEXT: sunpkhi z1.h, z0.b +; CHECK-NEXT: punpkhi p1.h, p0.b ; CHECK-NEXT: sunpklo z0.h, z0.b ; CHECK-NEXT: punpklo p0.h, p0.b -; CHECK-NEXT: sunpkhi z2.s, z1.h -; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: punpkhi p2.h, p1.b ; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: sunpkhi z2.s, z1.h +; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: ld1b { z2.s }, p2/z, [x0, z2.s, sxtw] ; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, z1.s, sxtw] ; CHECK-NEXT: punpkhi p1.h, p0.b diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll index 40d889f1b501e..d397424cb162f 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-sext.ll @@ -235,9 +235,9 @@ define @masked_sload_x2_8i8_8i64(ptr %a, ptr %b, @masked_zload_nxv8i16(ptr %a, %mask) define @masked_zload_2i16_2f64(ptr noalias %in, %mask) { ; CHECK-LABEL: masked_zload_2i16_2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] -; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ret %wide.load = call @llvm.masked.load.nxv2i16(ptr %in, i32 2, %mask, undef) %zext = zext %wide.load to @@ -230,9 +230,9 @@ define @masked_zload_x2_8i8_8i64(ptr %a, ptr %b, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.h, z1.b ; CHECK-NEXT: uunpklo z4.h, z0.b -; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpkhi z1.h, z1.b ; CHECK-NEXT: uunpkhi z0.h, z0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p2.h, p1.b +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: sunpklo z3.s, z2.h ; CHECK-NEXT: uunpklo z5.s, z4.h ; CHECK-NEXT: sunpkhi z2.s, z2.h -; CHECK-NEXT: punpklo p2.h, p1.b -; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: st1b { z5.s }, p2, [x0, z3.s, sxtw] ; CHECK-NEXT: uunpkhi z3.s, z4.h ; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw] -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.s, z1.h -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: uunpklo z3.s, z0.h ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw] ; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw] ; CHECK-NEXT: ret @@ -40,12 +40,12 @@ define void @masked_scatter_nxv16i8( %data, ptr %base, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1] ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] ; CHECK-NEXT: ret @@ -57,12 +57,12 @@ define void @masked_scatter_nxv8i16( %data, ptr %base, %data, ptr %base, %offsets, %mask) #0 { ; CHECK-LABEL: masked_scatter_nxv8bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: punpklo p1.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1] ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] ; 
CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll index e866474942cd7..94e525d22b825 100644 --- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll @@ -76,8 +76,8 @@ define void @masked_scatter_nxv2f64( %data, %pg) { ; CHECK-LABEL: masked_scatter_splat_constant_pointer: ; CHECK: // %bb.0: // %vector.body -; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: punpklo p1.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: st1w { z0.d }, p1, [z0.d] ; CHECK-NEXT: st1w { z0.d }, p0, [z0.d] diff --git a/llvm/test/CodeGen/AArch64/sve-pr62151.ll b/llvm/test/CodeGen/AArch64/sve-pr62151.ll index 7cec20fda429c..5ed34f14a0b14 100644 --- a/llvm/test/CodeGen/AArch64/sve-pr62151.ll +++ b/llvm/test/CodeGen/AArch64/sve-pr62151.ll @@ -5,8 +5,8 @@ define i32 @build_interpolation(<2 x i32> %0, <2 x i32> %1, <2 x i32> %2) { ; CHECK-LABEL: build_interpolation: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: mul v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $z2 ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: mla v0.2s, v1.2s, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll index 4d46ac5ecbaa9..6e08606db9537 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-arith.ll @@ -54,24 +54,24 @@ define aarch64_sve_vector_pcs @add_nxv64i1( ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ptrue p4.b ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue p6.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ldr p5, [x0] -; CHECK-NEXT: ldr p6, [x1] +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ldr p4, [x0] +; CHECK-NEXT: ldr p5, [x1] ; CHECK-NEXT: ldr p7, [x2] ; CHECK-NEXT: ldr p8, [x3] -; CHECK-NEXT: eor p0.b, p4/z, p0.b, p5.b -; CHECK-NEXT: eor p1.b, p4/z, p1.b, p6.b -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p2.b, p4/z, p2.b, p7.b +; CHECK-NEXT: eor p0.b, p6/z, p0.b, p4.b +; CHECK-NEXT: eor p1.b, p6/z, p1.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: eor p2.b, p6/z, p2.b, p7.b ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p3.b, p4/z, p3.b, p8.b +; CHECK-NEXT: eor p3.b, p6/z, p3.b, p8.b ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 @@ -138,24 +138,24 @@ define aarch64_sve_vector_pcs @sub_nxv64i1( ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG -; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; 
CHECK-NEXT: ptrue p4.b ; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ptrue p6.b ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: ldr p5, [x0] -; CHECK-NEXT: ldr p6, [x1] +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: ldr p4, [x0] +; CHECK-NEXT: ldr p5, [x1] ; CHECK-NEXT: ldr p7, [x2] ; CHECK-NEXT: ldr p8, [x3] -; CHECK-NEXT: eor p0.b, p4/z, p0.b, p5.b -; CHECK-NEXT: eor p1.b, p4/z, p1.b, p6.b -; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p2.b, p4/z, p2.b, p7.b +; CHECK-NEXT: eor p0.b, p6/z, p0.b, p4.b +; CHECK-NEXT: eor p1.b, p6/z, p1.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: eor p2.b, p6/z, p2.b, p7.b ; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: eor p3.b, p4/z, p3.b, p8.b +; CHECK-NEXT: eor p3.b, p6/z, p3.b, p8.b ; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: .cfi_def_cfa wsp, 16 diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll index 600e9c4805ff7..8438e9d88f5de 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop.ll @@ -322,10 +322,10 @@ entry: define @ornot_v4i32( %z, %x, %y) { ; CHECK-LABEL: ornot_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z3.s, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z2.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: eor z2.d, z2.d, z3.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret @@ -340,10 +340,10 @@ entry: define @ornot_v8i16( %z, %x, %y) { ; CHECK-LABEL: ornot_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z2.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: eor z2.d, z2.d, z3.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret @@ -358,10 +358,10 @@ entry: define @ornot_v16i8( %z, %x, %y) { ; CHECK-LABEL: ornot_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, #-1 // =0xffffffffffffffff -; CHECK-NEXT: eor z2.d, z2.d, z3.d +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: eor z2.d, z2.d, z3.d ; CHECK-NEXT: orr z1.d, z1.d, z2.d ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret @@ -904,8 +904,8 @@ define @addqr_v4i32( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: add z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -922,8 +922,8 @@ define @addqr_v8i16( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: add z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -940,8 +940,8 @@ define @addqr_v16i8( %z, ; 
CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: add z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -958,8 +958,8 @@ define @subqr_v4i32( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sub z1.s, z1.s, z2.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -976,8 +976,8 @@ define @subqr_v8i16( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sub z1.h, z1.h, z2.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -994,8 +994,8 @@ define @subqr_v16i8( %z, ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sub z1.b, z1.b, z2.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1010,10 +1010,10 @@ entry: define @mulqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: mulqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: mul z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: mul z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1028,10 +1028,10 @@ entry: define @mulqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: mulqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: mul z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: mul z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1046,10 +1046,10 @@ entry: define @mulqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: mulqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: mul z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: mul z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1064,11 +1064,11 @@ entry: define @faddqr_v4f32( %z, %x, float %y) { ; CHECK-LABEL: faddqr_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, s2 -; CHECK-NEXT: fadd z1.s, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fadd z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1083,11 +1083,11 @@ entry: define @faddqr_v8f16( %z, %x, half %y) { ; CHECK-LABEL: faddqr_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, h2 -; CHECK-NEXT: fadd z1.h, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fadd z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1102,11 +1102,11 @@ entry: define @fsubqr_v4f32( %z, %x, float %y) { ; CHECK-LABEL: fsubqr_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, s2 -; CHECK-NEXT: fsub z1.s, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov 
z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1121,11 +1121,11 @@ entry: define @fsubqr_v8f16( %z, %x, half %y) { ; CHECK-LABEL: fsubqr_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, h2 -; CHECK-NEXT: fsub z1.h, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1140,11 +1140,11 @@ entry: define @fmulqr_v4f32( %z, %x, float %y) { ; CHECK-LABEL: fmulqr_v4f32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $s2 killed $s2 def $z2 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, s2 -; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fmul z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1159,11 +1159,11 @@ entry: define @fmulqr_v8f16( %z, %x, half %y) { ; CHECK-LABEL: fmulqr_v8f16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $h2 killed $h2 def $z2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, h2 -; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 +; CHECK-NEXT: fmul z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1178,10 +1178,10 @@ entry: define @sadd_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: sadd_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sqadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sqadd z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1196,10 +1196,10 @@ entry: define @sadd_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: sadd_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sqadd z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sqadd z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1214,10 +1214,10 @@ entry: define @sadd_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: sadd_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sqadd z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sqadd z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1232,10 +1232,10 @@ entry: define @uadd_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: uadd_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: uqadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: uqadd z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1250,10 +1250,10 @@ entry: define @uadd_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: uadd_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: uqadd z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: uqadd z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1268,10 +1268,10 @@ entry: define @uadd_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: uadd_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: uqadd z1.b, z1.b, z2.b +; 
CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: uqadd z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1286,10 +1286,10 @@ entry: define @ssub_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: ssub_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: sqsub z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: sqsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1304,10 +1304,10 @@ entry: define @ssub_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: ssub_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: sqsub z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: sqsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1322,10 +1322,10 @@ entry: define @ssub_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: ssub_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: sqsub z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: sqsub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: @@ -1340,10 +1340,10 @@ entry: define @usub_satqr_v4i32( %z, %x, i32 %y) { ; CHECK-LABEL: usub_satqr_v4i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w0 -; CHECK-NEXT: uqsub z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 +; CHECK-NEXT: uqsub z1.s, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, z1.s ; CHECK-NEXT: ret entry: @@ -1358,10 +1358,10 @@ entry: define @usub_satqr_v8i16( %z, %x, i16 %y) { ; CHECK-LABEL: usub_satqr_v8i16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 -; CHECK-NEXT: uqsub z1.h, z1.h, z2.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: uqsub z1.h, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, z1.h ; CHECK-NEXT: ret entry: @@ -1376,10 +1376,10 @@ entry: define @usub_satqr_v16i8( %z, %x, i8 %y) { ; CHECK-LABEL: usub_satqr_v16i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 -; CHECK-NEXT: uqsub z1.b, z1.b, z2.b +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 +; CHECK-NEXT: uqsub z1.b, z1.b, z2.b ; CHECK-NEXT: mov z0.b, p0/m, z1.b ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll index 14bc1b45e79ee..2541910e080e3 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop2.ll @@ -202,9 +202,9 @@ entry: define @sdiv_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: sdiv_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z0.h @@ -288,9 +288,9 @@ entry: define @udiv_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: udiv_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z0.h @@ -376,17 +376,17 @@ entry: define @srem_nxv8i16_x( %x, %y, %n) { ; 
CHECK-LABEL: srem_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret entry: %c = icmp sgt %n, zeroinitializer @@ -419,8 +419,8 @@ define @srem_nxv16i8_x( %x, %n, zeroinitializer @@ -464,17 +464,17 @@ entry: define @urem_nxv8i16_x( %x, %y, %n) { ; CHECK-LABEL: urem_nxv8i16_x: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: mls z0.h, p0/m, z2.h, z1.h ; CHECK-NEXT: ret entry: %c = icmp sgt %n, zeroinitializer @@ -507,8 +507,8 @@ define @urem_nxv16i8_x( %x, %n, zeroinitializer @@ -1140,8 +1140,8 @@ define @fdiv_nxv8f16_x( %x, @sdiv_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: sdiv_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h @@ -1740,9 +1740,9 @@ entry: define @udiv_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: udiv_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h @@ -1830,17 +1830,17 @@ entry: define @srem_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: srem_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z3.s, z1.h ; CHECK-NEXT: sunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z5.s, z0.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: sunpklo z4.s, z1.h ; CHECK-NEXT: sdivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: msb z1.h, p0/m, z3.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: msb z1.h, p0/m, z2.h, z0.h ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -1874,8 +1874,8 @@ define @srem_nxv16i8_y( %x, @urem_nxv8i16_y( %x, %y, %n) { ; CHECK-LABEL: urem_nxv8i16_y: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpkhi z3.s, z1.h ; CHECK-NEXT: uunpkhi z4.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z5.s, z0.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: udivr z4.s, p0/m, z4.s, z5.s ; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, #0 -; CHECK-NEXT: uzp1 z3.h, z4.h, z3.h -; CHECK-NEXT: msb z1.h, p0/m, z3.h, z0.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z3.h +; CHECK-NEXT: msb z1.h, p0/m, z2.h, z0.h ; CHECK-NEXT: mov z0.d, z1.d ; CHECK-NEXT: ret entry: @@ -1966,8 +1966,8 @@ define 
@urem_nxv16i8_y( %x, @fdiv_nxv8f16_y( %x, @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -2871,8 +2871,8 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -2887,8 +2887,8 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -2903,8 +2903,8 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -2920,8 +2920,8 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -2937,8 +2937,8 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll index 0f09f7dac2982..bafd5abcc7b23 100644 --- a/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll +++ b/llvm/test/CodeGen/AArch64/sve-pred-selectop3.ll @@ -792,8 +792,8 @@ define @fdiv_nxv8f16_x( %x, @fdiv_nxv8f16_y( %x, @fmai_nxv4f32_y( %x, %n, zeroinitializer @@ -1750,8 +1750,8 @@ define @fmai_nxv8f16_y( %x, %n, zeroinitializer @@ -1766,8 +1766,8 @@ define @fmai_nxv2f64_y( %x, %n, zeroinitializer @@ -1782,8 +1782,8 @@ define @fma_nxv4f32_y( %x, %n, zeroinitializer @@ -1799,8 +1799,8 @@ define @fma_nxv8f16_y( %x, %n, zeroinitializer @@ -1816,8 +1816,8 @@ define @fma_nxv2f64_y( %x, %n, zeroinitializer diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll index fbadbf7226fd1..8bd38d7bc44df 100644 --- a/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll +++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-cmple.ll @@ -320,8 +320,8 @@ define i1 @cmp32_ptest_any_xx( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp8_ptest_first_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpge p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret @@ -338,8 +338,8 @@ define i1 @cmp8_ptest_first_ax( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp8_ptest_last_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: cmpge p0.b, p0/z, z0.b, z1.b +; CHECK-NEXT: ptrue p1.b ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret @@ -371,8 +371,8 @@ define i1 @cmp8_ptest_any_ax( %pg, %a, %pg, %a, %b) { ; CHECK-LABEL: cmp32_ptest_first_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: cmpge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret @@ -390,8 +390,8 @@ define i1 @cmp32_ptest_first_ax( %pg, %a, < define i1 @cmp32_ptest_last_ax( %pg, %a, %b) { ; CHECK-LABEL: cmp32_ptest_last_ax: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: cmpge p0.s, p0/z, z0.s, z1.s +; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ptest p1, p0.b ; CHECK-NEXT: cset w0, lo ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll index 6873404724f1d..508fe5d5a58a5 100644 --- a/llvm/test/CodeGen/AArch64/sve-redundant-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-redundant-store.ll @@ -35,8 +35,8 @@ entry: define void @keep_scalable_store(ptr writeonly %ptr, ptr %a, %b) { ; CHECK-LABEL: keep_scalable_store: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q2, q1, [x1] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: stp q2, q1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll index a1c2ec9c7e1d4..76190eba870de 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-extract-elt.ll @@ -22,15 +22,15 @@ define i8 
@split_extract_32i8_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1b { z0.b }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrb w0, [x9, x8] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -46,15 +46,15 @@ define i16 @split_extract_16i16_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -70,15 +70,15 @@ define i32 @split_extract_8i32_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -94,15 +94,15 @@ define i64 @split_extract_8i64_idx( %a, i32 %idx) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x9, x8 -; CHECK-NEXT: csel x8, x9, x8, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] +; CHECK-NEXT: csel x8, x9, x8, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] @@ -140,15 +140,15 @@ define i16 @split_extract_16i16( %a) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #128 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul 
vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldrh w0, [x9, x8, lsl #1] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -164,16 +164,16 @@ define i32 @split_extract_16i32( %a) { ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rdvl x8, #1 ; CHECK-NEXT: mov w9, #34464 // =0x86a0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w9, #1, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, x9 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z3.s }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1w { z2.s }, p0, [sp, #2, mul vl] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: ldr w0, [x9, x8, lsl #2] @@ -191,15 +191,15 @@ define i64 @split_extract_4i64( %a) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cntw x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w9, #10 // =0xa ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #10 -; CHECK-NEXT: csel x8, x8, x9, lo -; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] +; CHECK-NEXT: csel x8, x8, x9, lo +; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: ldr x0, [x9, x8, lsl #3] ; CHECK-NEXT: addvl sp, sp, #2 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll index 3997409172d03..bc015116917d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll @@ -6,9 +6,9 @@ define @fcvts_nxv8f16( %a) { ; CHECK-LABEL: fcvts_nxv8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z1.s, z0.h ; CHECK-NEXT: uunpkhi z2.s, z0.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.s, p0/m, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -21,9 +21,9 @@ define @fcvts_nxv8f16( %a) { define @fcvtd_nxv4f16( %a) { ; CHECK-LABEL: fcvtd_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -37,8 +37,8 @@ define @fcvtd_nxv8f16( %a) { ; CHECK-LABEL: fcvtd_nxv8f16: ; CHECK: // %bb.0: ; CHECK-NEXT: uunpklo z1.s, z0.h -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z1.s ; CHECK-NEXT: uunpkhi z1.d, z1.s ; CHECK-NEXT: uunpklo z3.d, z0.s @@ -58,9 +58,9 @@ define @fcvtd_nxv8f16( %a) { define @fcvtd_nxv4f32( %a) { ; CHECK-LABEL: fcvtd_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvt z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -73,11 +73,11 @@ define @fcvtd_nxv4f32( %a) { define @fcvtd_nxv8f32( %a) { ; CHECK-LABEL: fcvtd_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z2.d, z0.s ; CHECK-NEXT: uunpkhi z3.d, 
z0.s ; CHECK-NEXT: uunpklo z4.d, z1.s ; CHECK-NEXT: uunpkhi z5.d, z1.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z2 ; CHECK-NEXT: fcvt z0.d, p0/m, z2.s ; CHECK-NEXT: movprfx z1, z3 @@ -195,9 +195,9 @@ define @fcvtzs_h_nxv8f64( %a) { define @fcvtzs_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzs_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -210,11 +210,11 @@ define @fcvtzs_d_nxv4f32( %a) { define @fcvtzs_s_nxv16f16( %a) { ; CHECK-LABEL: fcvtzs_s_nxv16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uunpklo z2.s, z0.h ; CHECK-NEXT: uunpkhi z3.s, z0.h ; CHECK-NEXT: uunpklo z4.s, z1.h ; CHECK-NEXT: uunpkhi z5.s, z1.h +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z2 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z2.h ; CHECK-NEXT: movprfx z1, z3 @@ -247,9 +247,9 @@ define @fcvtzu_s_nxv4f64( %a) { define @fcvtzu_d_nxv4f32( %a) { ; CHECK-LABEL: fcvtzu_d_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -295,8 +295,8 @@ define @scvtf_s_nxv16i8( %a) { ; CHECK-LABEL: scvtf_s_nxv16i8: ; CHECK: // %bb.0: ; CHECK-NEXT: sunpklo z1.h, z0.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z0.h, z0.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpkhi z1.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h @@ -316,9 +316,9 @@ define @scvtf_s_nxv16i8( %a) { define @scvtf_d_nxv4i32( %a) { ; CHECK-LABEL: scvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sunpklo z1.d, z0.s ; CHECK-NEXT: sunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: scvtf z0.d, p0/m, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -333,8 +333,8 @@ define @scvtf_d_nxv4i1( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p2/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: scvtf z0.d, p1/m, z0.d ; CHECK-NEXT: scvtf z1.d, p1/m, z1.d @@ -378,9 +378,9 @@ define @ucvtf_h_nxv8i64( %a) { define @ucvtf_d_nxv4i32( %a) { ; CHECK-LABEL: ucvtf_d_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uunpklo z1.d, z0.s ; CHECK-NEXT: uunpkhi z2.d, z0.s +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: movprfx z0, z1 ; CHECK-NEXT: ucvtf z0.d, p0/m, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -395,8 +395,8 @@ define @ucvtf_d_nxv4i1( %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b -; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z0.d, p2/z, #1 // =0x1 +; CHECK-NEXT: ptrue p1.d ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1 ; CHECK-NEXT: ucvtf z0.d, p1/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p1/m, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll index 7f642882eddbe..696b6c34ef041 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-fp-reduce.ll @@ -23,8 +23,8 @@ define double @fadda_nxv8f64(double %init, %a) { define float @faddv_nxv8f32(float %init, %a) { ; CHECK-LABEL: faddv_nxv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fadd z1.s, z1.s, z2.s +; CHECK-NEXT: ptrue 
p0.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll index 5441659fa5cb4..75366384cb750 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-insert-elt.ll @@ -6,9 +6,9 @@ define @promote_insert_8i8( %a, i8 %elt, i64 %idx) { ; CHECK-LABEL: promote_insert_8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: index z1.h, #0, #1 ; CHECK-NEXT: mov z2.h, w1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h ; CHECK-NEXT: mov z0.h, p0/m, w0 ; CHECK-NEXT: ret @@ -23,13 +23,13 @@ define @split_insert_32i8_idx( %a, i8 %elt, ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1b { z1.b }, p0, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1b { z0.b }, p0, [sp] ; CHECK-NEXT: strb w0, [x9, x8] ; CHECK-NEXT: ld1b { z0.b }, p0/z, [sp] @@ -48,13 +48,13 @@ define @split_insert_8f32_idx( %a, floa ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x0, x8 -; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] +; CHECK-NEXT: csel x8, x0, x8, lo ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str s2, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] @@ -73,13 +73,13 @@ define @split_insert_8i64_idx( %a, i64 %elt ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cnth x8 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x1, x8 -; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1d { z3.d }, p0, [sp, #3, mul vl] +; CHECK-NEXT: csel x8, x1, x8, lo ; CHECK-NEXT: st1d { z2.d }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1d { z1.d }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1d { z0.d }, p0, [sp] @@ -100,9 +100,9 @@ define @split_insert_8i64_idx( %a, i64 %elt define @promote_insert_4i16( %a, i16 %elt) { ; CHECK-LABEL: promote_insert_4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s ; CHECK-NEXT: mov z0.s, p0/m, w0 @@ -117,9 +117,9 @@ define @promote_insert_4i16( %a, i16 %elt) define @split_insert_32i8( %a, i8 %elt) { ; CHECK-LABEL: split_insert_32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b ; CHECK-NEXT: mov z0.b, p0/m, w0 @@ -135,14 +135,14 @@ define @split_insert_32i16( %a, i16 %elt) ; CHECK-NEXT: addvl sp, sp, #-4 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 
0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rdvl x8, #2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w9, #128 // =0x80 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1h { z3.h }, p0, [sp, #3, mul vl] ; CHECK-NEXT: st1h { z2.h }, p0, [sp, #2, mul vl] ; CHECK-NEXT: st1h { z1.h }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1h { z0.h }, p0, [sp] @@ -165,15 +165,15 @@ define @split_insert_8i32( %a, i32 %elt) { ; CHECK-NEXT: addvl sp, sp, #-2 ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cnth x8 ; CHECK-NEXT: mov w9, #16960 // =0x4240 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movk w9, #15, lsl #16 ; CHECK-NEXT: sub x8, x8, #1 ; CHECK-NEXT: cmp x8, x9 +; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: csel x8, x8, x9, lo ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: st1w { z1.s }, p0, [sp, #1, mul vl] ; CHECK-NEXT: st1w { z0.s }, p0, [sp] ; CHECK-NEXT: str w0, [x9, x8, lsl #2] ; CHECK-NEXT: ld1w { z0.s }, p0/z, [sp] diff --git a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll index 42f3a163d14cc..dd7b15ef5ee6f 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-int-reduce.ll @@ -17,8 +17,8 @@ define i8 @andv_nxv8i8( %a) { define i32 @andv_nxv8i32( %a) { ; CHECK-LABEL: andv_nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -71,8 +71,8 @@ define i16 @xorv_nxv2i16( %a) { define i32 @xorv_nxv8i32( %a) { ; CHECK-LABEL: xorv_nxv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eor z0.d, z0.d, z1.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -97,8 +97,8 @@ define i16 @uaddv_nxv4i16( %a) { define i16 @uaddv_nxv16i16( %a) { ; CHECK-LABEL: uaddv_nxv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: add z0.h, z0.h, z1.h +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 @@ -127,8 +127,8 @@ define i32 @uaddv_nxv16i32( %a) { define i32 @umin_nxv2i32( %a) { ; CHECK-LABEL: umin_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll index af03059cf0d8b..754f0339702dc 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-load.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll @@ -93,8 +93,8 @@ define @masked_load_split_32i16(ptr %a, % ; CHECK-NEXT: punpklo p2.h, p0.b ; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: punpklo p3.h, p1.b -; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0] +; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1h { z2.h }, p3/z, [x0, #2, mul vl] ; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0, #3, mul vl] @@ -123,8 +123,8 @@ 
define @masked_load_split_8i64(ptr %a, %pg) ; CHECK-NEXT: punpklo p2.h, p1.b ; CHECK-NEXT: punpkhi p1.h, p1.b ; CHECK-NEXT: punpklo p3.h, p0.b -; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0] +; CHECK-NEXT: punpkhi p0.h, p0.b ; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl] ; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #2, mul vl] ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl] diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll index 90ec783ea4dbc..affa9a18ac182 100644 --- a/llvm/test/CodeGen/AArch64/sve-split-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll @@ -81,8 +81,8 @@ define void @masked_store_split_32i16( %data, ptr %a, %data, ptr %a, %data, ptr %a, i32 1, %pg) diff --git a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll index 9c3d4b1e5a810..f556d60d23b88 100644 --- a/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll +++ b/llvm/test/CodeGen/AArch64/sve-srem-combine-loop.ll @@ -6,8 +6,8 @@ target triple = "aarch64-unknown-linux-gnu" define @srem_combine_loop( %a) #0 { ; CHECK-LABEL: srem_combine_loop: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, #2 // =0x2 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #1 ; CHECK-NEXT: mls z0.s, p0/m, z1.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll index 728041d3f916b..3273e6b384f63 100644 --- a/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve-st1-addressing-mode-reg-imm.ll @@ -105,8 +105,8 @@ define void @st1d_inbound( %data, ptr %a) { define void @store_nxv2f32(ptr %out) { ; CHECK-LABEL: store_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: fmov z0.s, #1.00000000 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: st1w { z0.d }, p0, [x0] ; CHECK-NEXT: ret %ins = insertelement undef, float 1.0, i32 0 @@ -118,8 +118,8 @@ define void @store_nxv2f32(ptr %out) { define void @store_nxv4f16(ptr %out) { ; CHECK-LABEL: store_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: fmov z0.h, #1.00000000 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret %ins = insertelement undef, half 1.0, i32 0 @@ -133,9 +133,9 @@ define void @store_nxv4f16(ptr %out) { define void @store_nxv6f32(ptr %out) { ; CHECK-LABEL: store_nxv6f32: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ptrue p1.s -; CHECK-NEXT: fmov z0.s, #1.00000000 ; CHECK-NEXT: st1w { z0.d }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1w { z0.s }, p1, [x0] ; CHECK-NEXT: ret @@ -148,9 +148,9 @@ define void @store_nxv6f32(ptr %out) { define void @store_nxv12f16(ptr %out) { ; CHECK-LABEL: store_nxv12f16: ; CHECK: // %bb.0: +; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ptrue p1.h -; CHECK-NEXT: fmov z0.h, #1.00000000 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, #2, mul vl] ; CHECK-NEXT: st1h { z0.h }, p1, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-stepvector.ll b/llvm/test/CodeGen/AArch64/sve-stepvector.ll index 6f5a31248de7e..4c5f27d3e7093 100644 --- a/llvm/test/CodeGen/AArch64/sve-stepvector.ll +++ b/llvm/test/CodeGen/AArch64/sve-stepvector.ll @@ -208,9 +208,9 @@ entry: define @multiple_use_stepvector_nxv4i32_1(i32 %data) { ; CHECK-LABEL: multiple_use_stepvector_nxv4i32_1: ; CHECK: // %bb.0: // %entry -; 
CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: index z0.s, w0, #1 ; CHECK-NEXT: mov z1.s, w0 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z1.s, p0/m, z1.s, z0.s ; CHECK-NEXT: sub z0.s, z1.s, z0.s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll index be5c318e675df..d547f99a0230a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-bit-counting.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @ctlz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctlz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: sub z0.h, z0.h, #8 // =0x8 @@ -49,8 +49,8 @@ define <16 x i8> @ctlz_v16i8(<16 x i8> %op) { define void @ctlz_v32i8(ptr %a) { ; CHECK-LABEL: ctlz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: clz z0.b, p0/m, z0.b ; CHECK-NEXT: clz z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -64,8 +64,8 @@ define void @ctlz_v32i8(ptr %a) { define <2 x i16> @ctlz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctlz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: sub z0.s, z0.s, #16 // =0x10 @@ -102,8 +102,8 @@ define <8 x i16> @ctlz_v8i16(<8 x i16> %op) { define void @ctlz_v16i16(ptr %a) { ; CHECK-LABEL: ctlz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: clz z0.h, p0/m, z0.h ; CHECK-NEXT: clz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -141,8 +141,8 @@ define <4 x i32> @ctlz_v4i32(<4 x i32> %op) { define void @ctlz_v8i32(ptr %a) { ; CHECK-LABEL: ctlz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: clz z0.s, p0/m, z0.s ; CHECK-NEXT: clz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -180,8 +180,8 @@ define <2 x i64> @ctlz_v2i64(<2 x i64> %op) { define void @ctlz_v4i64(ptr %a) { ; CHECK-LABEL: ctlz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: clz z0.d, p0/m, z0.d ; CHECK-NEXT: clz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -199,8 +199,8 @@ define void @ctlz_v4i64(ptr %a) { define <4 x i8> @ctpop_v4i8(<4 x i8> %op) { ; CHECK-LABEL: ctpop_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -236,8 +236,8 @@ define <16 x i8> @ctpop_v16i8(<16 x i8> %op) { define void @ctpop_v32i8(ptr %a) { ; CHECK-LABEL: ctpop_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cnt z0.b, p0/m, z0.b ; CHECK-NEXT: cnt z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -251,8 +251,8 @@ define void @ctpop_v32i8(ptr %a) { define <2 x i16> @ctpop_v2i16(<2 x i16> %op) { ; CHECK-LABEL: ctpop_v2i16: ; CHECK: // %bb.0: -; 
CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -288,8 +288,8 @@ define <8 x i16> @ctpop_v8i16(<8 x i16> %op) { define void @ctpop_v16i16(ptr %a) { ; CHECK-LABEL: ctpop_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cnt z0.h, p0/m, z0.h ; CHECK-NEXT: cnt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -327,8 +327,8 @@ define <4 x i32> @ctpop_v4i32(<4 x i32> %op) { define void @ctpop_v8i32(ptr %a) { ; CHECK-LABEL: ctpop_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cnt z0.s, p0/m, z0.s ; CHECK-NEXT: cnt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -366,8 +366,8 @@ define <2 x i64> @ctpop_v2i64(<2 x i64> %op) { define void @ctpop_v4i64(ptr %a) { ; CHECK-LABEL: ctpop_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cnt z0.d, p0/m, z0.d ; CHECK-NEXT: cnt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -385,8 +385,8 @@ define void @ctpop_v4i64(ptr %a) { define <4 x i8> @cttz_v4i8(<4 x i8> %op) { ; CHECK-LABEL: cttz_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: orr z0.h, z0.h, #0x100 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h @@ -425,8 +425,8 @@ define <16 x i8> @cttz_v16i8(<16 x i8> %op) { define void @cttz_v32i8(ptr %a) { ; CHECK-LABEL: cttz_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: clz z0.b, p0/m, z0.b @@ -442,8 +442,8 @@ define void @cttz_v32i8(ptr %a) { define <2 x i16> @cttz_v2i16(<2 x i16> %op) { ; CHECK-LABEL: cttz_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: orr z0.s, z0.s, #0x10000 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s @@ -482,8 +482,8 @@ define <8 x i16> @cttz_v8i16(<8 x i16> %op) { define void @cttz_v16i16(ptr %a) { ; CHECK-LABEL: cttz_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: clz z0.h, p0/m, z0.h @@ -525,8 +525,8 @@ define <4 x i32> @cttz_v4i32(<4 x i32> %op) { define void @cttz_v8i32(ptr %a) { ; CHECK-LABEL: cttz_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: clz z0.s, p0/m, z0.s @@ -568,8 +568,8 @@ define <2 x i64> @cttz_v2i64(<2 x i64> %op) { define void @cttz_v4i64(ptr %a) { ; CHECK-LABEL: cttz_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: clz z0.d, p0/m, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll index 
251a7c3b18a9f..0aefba2d4c6ab 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ext-loads.ll @@ -169,15 +169,15 @@ define <16 x i64> @load_zext_v16i16i64(ptr %ap) { ; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, x8, lsl #1] ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: ld1h { z2.d }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: mov x9, #10 // =0xa ; CHECK-NEXT: ld1h { z3.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: mov x10, #12 // =0xc +; CHECK-NEXT: mov x9, #10 // =0xa ; CHECK-NEXT: ld1h { z4.d }, p0/z, [x0, x8, lsl #1] -; CHECK-NEXT: mov x8, #14 // =0xe +; CHECK-NEXT: mov x8, #12 // =0xc +; CHECK-NEXT: mov x10, #14 // =0xe ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] ; CHECK-NEXT: ld1h { z5.d }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x10, lsl #1] -; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z6.d }, p0/z, [x0, x8, lsl #1] +; CHECK-NEXT: ld1h { z7.d }, p0/z, [x0, x10, lsl #1] ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 killed $z1 ; CHECK-NEXT: // kill: def $q2 killed $q2 killed $z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 2ace0bca274af..0d6675def8b52 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -241,8 +241,8 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.d ; SVE-NEXT: ldr q0, [x1] ; SVE-NEXT: ldr d1, [x0] -; SVE-NEXT: and z1.s, z1.s, #0x7fffffff ; SVE-NEXT: fcvt z0.s, p0/m, z0.d +; SVE-NEXT: and z1.s, z1.s, #0x7fffffff ; SVE-NEXT: uzp1 z0.s, z0.s, z0.s ; SVE-NEXT: and z0.s, z0.s, #0x80000000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -274,8 +274,8 @@ define void @test_copysign_v2f32_v2f64(ptr %ap, ptr %bp) { define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v4f32_v4f64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d ; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: ptrue p0.d ; SVE-NEXT: fcvt z1.s, p0/m, z1.d ; SVE-NEXT: fcvt z0.s, p0/m, z0.d ; SVE-NEXT: ptrue p0.s, vl2 @@ -291,8 +291,8 @@ define void @test_copysign_v4f32_v4f64(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v4f32_v4f64: ; SVE2: // %bb.0: -; SVE2-NEXT: ptrue p0.d ; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ptrue p0.d ; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: fcvt z1.s, p0/m, z1.d ; SVE2-NEXT: fcvt z0.s, p0/m, z0.d @@ -319,8 +319,8 @@ define void @test_copysign_v2f64_v2f32(ptr %ap, ptr %bp) { ; SVE: // %bb.0: ; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldr q0, [x0] -; SVE-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; SVE-NEXT: ld1w { z1.d }, p0/z, [x1] +; SVE-NEXT: and z0.d, z0.d, #0x7fffffffffffffff ; SVE-NEXT: fcvt z1.d, p0/m, z1.s ; SVE-NEXT: and z1.d, z1.d, #0x8000000000000000 ; SVE-NEXT: orr z0.d, z0.d, z1.d @@ -354,10 +354,10 @@ define void @test_copysign_v4f64_v4f32(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: mov x8, #2 // =0x2 ; SVE-NEXT: ldp q2, q3, [x0] -; SVE-NEXT: and z2.d, z2.d, #0x7fffffffffffffff -; SVE-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; SVE-NEXT: ld1w { z0.d }, p0/z, [x1] ; SVE-NEXT: ld1w { z1.d }, p0/z, [x1, x8, lsl #2] +; SVE-NEXT: and z2.d, z2.d, #0x7fffffffffffffff +; SVE-NEXT: and z3.d, z3.d, #0x7fffffffffffffff ; SVE-NEXT: fcvt z0.d, p0/m, z0.s ; SVE-NEXT: fcvt z1.d, p0/m, z1.s ; SVE-NEXT: and z0.d, z0.d, 
#0x8000000000000000 @@ -397,8 +397,8 @@ define void @test_copysign_v4f16_v4f32(ptr %ap, ptr %bp) { ; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldr q0, [x1] ; SVE-NEXT: ldr d1, [x0] -; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: fcvt z0.h, p0/m, z0.s +; SVE-NEXT: and z1.h, z1.h, #0x7fff ; SVE-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE-NEXT: and z0.h, z0.h, #0x8000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -429,13 +429,13 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE: // %bb.0: ; SVE-NEXT: ldp q0, q1, [x1] ; SVE-NEXT: ptrue p0.s, vl2 -; SVE-NEXT: ptrue p1.s ; SVE-NEXT: fcvtxn v1.2s, v1.2d ; SVE-NEXT: fcvtxn v0.2s, v0.2d ; SVE-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldr d1, [x0] ; SVE-NEXT: and z1.h, z1.h, #0x7fff -; SVE-NEXT: fcvt z0.h, p1/m, z0.s +; SVE-NEXT: fcvt z0.h, p0/m, z0.s ; SVE-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE-NEXT: and z0.h, z0.h, #0x8000 ; SVE-NEXT: orr z0.d, z1.d, z0.d @@ -446,13 +446,13 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { ; SVE2: // %bb.0: ; SVE2-NEXT: ldp q0, q1, [x1] ; SVE2-NEXT: ptrue p0.s, vl2 -; SVE2-NEXT: ptrue p1.s ; SVE2-NEXT: ldr d2, [x0] ; SVE2-NEXT: fcvtxn v1.2s, v1.2d ; SVE2-NEXT: fcvtxn v0.2s, v0.2d ; SVE2-NEXT: splice z0.s, p0, z0.s, z1.s +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: mov z1.h, #32767 // =0x7fff -; SVE2-NEXT: fcvt z0.h, p1/m, z0.s +; SVE2-NEXT: fcvt z0.h, p0/m, z0.s ; SVE2-NEXT: uzp1 z0.h, z0.h, z0.h ; SVE2-NEXT: bsl z2.d, z2.d, z0.d, z1.d ; SVE2-NEXT: str d2, [x0] @@ -470,8 +470,8 @@ define void @test_copysign_v4f16_v4f64(ptr %ap, ptr %bp) { define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; SVE-LABEL: test_copysign_v8f16_v8f32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s ; SVE-NEXT: ldp q0, q1, [x1] +; SVE-NEXT: ptrue p0.s ; SVE-NEXT: fcvt z1.h, p0/m, z1.s ; SVE-NEXT: fcvt z0.h, p0/m, z0.s ; SVE-NEXT: ptrue p0.h, vl4 @@ -487,8 +487,8 @@ define void @test_copysign_v8f16_v8f32(ptr %ap, ptr %bp) { ; ; SVE2-LABEL: test_copysign_v8f16_v8f32: ; SVE2: // %bb.0: -; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: ldp q0, q1, [x1] +; SVE2-NEXT: ptrue p0.s ; SVE2-NEXT: ldr q2, [x0] ; SVE2-NEXT: fcvt z1.h, p0/m, z1.s ; SVE2-NEXT: fcvt z0.h, p0/m, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll index c436dea8ff1b2..c2d6ed4e9ccf9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-arith.ll @@ -50,8 +50,8 @@ define <8 x half> @fadd_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -94,8 +94,8 @@ define <4 x float> @fadd_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -125,8 +125,8 @@ define <2 x double> @fadd_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] 
; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -186,8 +186,8 @@ define <8 x half> @fdiv_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fdiv_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -230,8 +230,8 @@ define <4 x float> @fdiv_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fdiv_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -261,8 +261,8 @@ define <2 x double> @fdiv_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fdiv_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fdiv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -325,8 +325,8 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h @@ -373,8 +373,8 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s @@ -407,8 +407,8 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d @@ -470,8 +470,8 @@ define <8 x half> @fmul_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmul_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -514,8 +514,8 @@ define <4 x float> @fmul_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmul_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -545,8 +545,8 @@ define <2 x double> @fmul_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmul_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmul_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -603,8 +603,8 @@ define <8 x half> @fneg_v8f16(<8 x half> %op) { 
define void @fneg_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fneg_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fneg z0.h, p0/m, z0.h ; CHECK-NEXT: fneg z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -642,8 +642,8 @@ define <4 x float> @fneg_v4f32(<4 x float> %op) { define void @fneg_v8f32(ptr %a) { ; CHECK-LABEL: fneg_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fneg z0.s, p0/m, z0.s ; CHECK-NEXT: fneg z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -669,8 +669,8 @@ define <2 x double> @fneg_v2f64(<2 x double> %op) { define void @fneg_v4f64(ptr %a) { ; CHECK-LABEL: fneg_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fneg z0.d, p0/m, z0.d ; CHECK-NEXT: fneg z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -724,8 +724,8 @@ define <8 x half> @fsqrt_v8f16(<8 x half> %op) { define void @fsqrt_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fsqrt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h ; CHECK-NEXT: fsqrt z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -763,8 +763,8 @@ define <4 x float> @fsqrt_v4f32(<4 x float> %op) { define void @fsqrt_v8f32(ptr %a) { ; CHECK-LABEL: fsqrt_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s ; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -790,8 +790,8 @@ define <2 x double> @fsqrt_v2f64(<2 x double> %op) { define void @fsqrt_v4f64(ptr %a) { ; CHECK-LABEL: fsqrt_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d ; CHECK-NEXT: fsqrt z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -848,8 +848,8 @@ define <8 x half> @fsub_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fsub_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -892,8 +892,8 @@ define <4 x float> @fsub_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fsub_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -923,8 +923,8 @@ define <2 x double> @fsub_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fsub_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fsub_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fsubr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -981,8 +981,8 @@ define <8 x half> @fabs_v8f16(<8 x half> %op) { define void @fabs_v16f16(ptr %a) { ; CHECK-LABEL: fabs_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fabs z0.h, p0/m, z0.h ; CHECK-NEXT: fabs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -1020,8 +1020,8 @@ define <4 x float> 
@fabs_v4f32(<4 x float> %op) { define void @fabs_v8f32(ptr %a) { ; CHECK-LABEL: fabs_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fabs z0.s, p0/m, z0.s ; CHECK-NEXT: fabs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -1047,8 +1047,8 @@ define <2 x double> @fabs_v2f64(<2 x double> %op) { define void @fabs_v4f64(ptr %a) { ; CHECK-LABEL: fabs_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fabs z0.d, p0/m, z0.d ; CHECK-NEXT: fabs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll index aad078f035f7d..e92694d1fc80d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-compares.ll @@ -57,8 +57,8 @@ define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fcmp_oeq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oeq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -107,8 +107,8 @@ define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fcmp_oeq_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oeq_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s @@ -157,8 +157,8 @@ define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oeq_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d @@ -181,8 +181,8 @@ define void @fcmp_oeq_v4f64(ptr %a, ptr %b, ptr %c) { define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ueq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p2.h, p0/z, z1.h, z0.h @@ -209,8 +209,8 @@ define void @fcmp_ueq_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_one_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p2.h, p0/z, z1.h, z0.h @@ -237,8 +237,8 @@ define void @fcmp_one_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_une_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h @@ -261,8 +261,8 @@ define void @fcmp_une_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: 
fcmp_ogt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h @@ -285,8 +285,8 @@ define void @fcmp_ogt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ugt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h @@ -312,8 +312,8 @@ define void @fcmp_ugt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_olt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h @@ -336,8 +336,8 @@ define void @fcmp_olt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ult_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h @@ -363,8 +363,8 @@ define void @fcmp_ult_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_oge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h @@ -387,8 +387,8 @@ define void @fcmp_oge_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_uge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h @@ -414,8 +414,8 @@ define void @fcmp_uge_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ole_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h @@ -438,8 +438,8 @@ define void @fcmp_ole_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ule_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h @@ -465,8 +465,8 @@ define void @fcmp_ule_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_uno_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h @@ -489,8 +489,8 @@ define void @fcmp_uno_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ord_v16f16: ; CHECK: // %bb.0: 
-; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmuo p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmuo p0.h, p0/z, z2.h, z3.h @@ -516,8 +516,8 @@ define void @fcmp_ord_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_eq_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -540,8 +540,8 @@ define void @fcmp_eq_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ne_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmne p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmne p0.h, p0/z, z2.h, z3.h @@ -564,8 +564,8 @@ define void @fcmp_ne_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_gt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z2.h, z3.h @@ -588,8 +588,8 @@ define void @fcmp_gt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_lt_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmgt p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmgt p0.h, p0/z, z3.h, z2.h @@ -612,8 +612,8 @@ define void @fcmp_lt_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_ge_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: fcmge p0.h, p0/z, z2.h, z3.h @@ -636,8 +636,8 @@ define void @fcmp_ge_v16f16(ptr %a, ptr %b, ptr %c) { define void @fcmp_le_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fcmp_le_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fcmge p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmge p0.h, p0/z, z3.h, z2.h diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll index 18f9a4d371d0c..9bdde14e8d83d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-convert.ll @@ -8,9 +8,9 @@ target triple = "aarch64-unknown-linux-gnu" define void @fp_convert_combine_crash(ptr %a, ptr %b) { ; CHECK-LABEL: fp_convert_combine_crash: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmov z0.s, #8.00000000 +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmul z1.s, p0/m, z1.s, z0.s ; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z2.s ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll index 
28e02da53af43..244a405101739 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-extend-trunc.ll @@ -11,8 +11,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v2f16_to_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str d0, [x0] @@ -25,8 +25,8 @@ define void @fcvt_v2f16_to_v2f32(<2 x half> %a, ptr %b) { define void @fcvt_v4f16_to_v4f32(<4 x half> %a, ptr %b) { ; CHECK-LABEL: fcvt_v4f16_to_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvt z0.s, p0/m, z0.h ; CHECK-NEXT: str q0, [x0] @@ -371,8 +371,8 @@ define void @fcvt_v4f32_v4f16(ptr %a, ptr %b) { define void @fcvt_v8f32_v8f16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvt_v8f32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov x8, #4 // =0x4 ; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: fcvt z1.h, p0/m, z1.s @@ -420,8 +420,8 @@ define void @fcvt_v2f64_v2f16(ptr %a, ptr %b) { define void @fcvt_v4f64_v4f16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvt_v4f64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: fcvt z0.h, p0/m, z0.d ; CHECK-NEXT: fcvt z1.h, p0/m, z1.d @@ -467,8 +467,8 @@ define void @fcvt_v2f64_v2f32(<2 x double> %op1, ptr %b) { define void @fcvt_v4f64_v4f32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvt_v4f64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: mov x8, #2 // =0x2 ; CHECK-NEXT: fcvt z0.s, p0/m, z0.d ; CHECK-NEXT: fcvt z1.s, p0/m, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll index b5df97f767c13..478be9ab76dd9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-fma.ll @@ -40,8 +40,8 @@ define <8 x half> @fma_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x half> %op3) define void @fma_v16f16(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.h, p0/m, z2.h, z1.h @@ -91,8 +91,8 @@ define <4 x float> @fma_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x float> %o define void @fma_v8f32(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fmad z0.s, p0/m, z2.s, z1.s @@ -140,8 +140,8 @@ define <2 x double> @fma_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x double define void @fma_v4f64(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: fma_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q4, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q5, [x2] ; CHECK-NEXT: ldp q2, 
q3, [x0] ; CHECK-NEXT: fmad z0.d, p0/m, z2.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll index 07a67e2650290..4dc034adf459a 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-minmax.ll @@ -37,8 +37,8 @@ define <8 x half> @fmaxnm_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmaxnm_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -81,8 +81,8 @@ define <4 x float> @fmaxnm_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmaxnm_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -123,8 +123,8 @@ define <2 x double> @fmaxnm_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmaxnm_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmaxnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -171,8 +171,8 @@ define <8 x half> @fminnm_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fminnm_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -215,8 +215,8 @@ define <4 x float> @fminnm_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fminnm_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -257,8 +257,8 @@ define <2 x double> @fminnm_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fminnm_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fminnm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -305,8 +305,8 @@ define <8 x half> @fmax_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmax_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -349,8 +349,8 @@ define <4 x float> @fmax_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmax_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmax_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -391,8 +391,8 @@ define <2 x double> @fmax_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmax_v4f64(ptr %a, ptr 
%b) { ; CHECK-LABEL: fmax_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -439,8 +439,8 @@ define <8 x half> @fmin_v8f16(<8 x half> %op1, <8 x half> %op2) { define void @fmin_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -483,8 +483,8 @@ define <4 x float> @fmin_v4f32(<4 x float> %op1, <4 x float> %op2) { define void @fmin_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -525,8 +525,8 @@ define <2 x double> @fmin_v2f64(<2 x double> %op1, <2 x double> %op2) { define void @fmin_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fmin_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll index d2d771c48c204..bd10a0e091c0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-reduce.ll @@ -211,8 +211,8 @@ define half @faddv_v8f16(half %start, <8 x half> %a) { define half @faddv_v16f16(half %start, ptr %a) { ; CHECK-LABEL: faddv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z2.h ; CHECK-NEXT: faddv h1, p0, z1.h ; CHECK-NEXT: fadd h0, h0, h1 @@ -249,8 +249,8 @@ define float @faddv_v4f32(float %start, <4 x float> %a) { define float @faddv_v8f32(float %start, ptr %a) { ; CHECK-LABEL: faddv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s ; CHECK-NEXT: faddv s1, p0, z1.s ; CHECK-NEXT: fadd s0, s0, s1 @@ -285,8 +285,8 @@ define double @faddv_v2f64(double %start, <2 x double> %a) { define double @faddv_v4f64(double %start, ptr %a) { ; CHECK-LABEL: faddv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fadd z1.d, p0/m, z1.d, z2.d ; CHECK-NEXT: faddv d1, p0, z1.d ; CHECK-NEXT: fadd d0, d0, d1 @@ -327,8 +327,8 @@ define half @fmaxv_v8f16(<8 x half> %a) { define half @fmaxv_v16f16(ptr %a) { ; CHECK-LABEL: fmaxv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmaxnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fmaxnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -365,8 +365,8 @@ define float @fmaxv_v4f32(<4 x float> %a) { define float @fmaxv_v8f32(ptr %a) { ; CHECK-LABEL: fmaxv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmaxnmv s0, p0, z0.s ; CHECK-NEXT: // kill: 
def $s0 killed $s0 killed $z0 @@ -401,8 +401,8 @@ define double @fmaxv_v2f64(<2 x double> %a) { define double @fmaxv_v4f64(ptr %a) { ; CHECK-LABEL: fmaxv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmaxnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fmaxnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -443,8 +443,8 @@ define half @fminv_v8f16(<8 x half> %a) { define half @fminv_v16f16(ptr %a) { ; CHECK-LABEL: fminv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fminnm z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fminnmv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -481,8 +481,8 @@ define float @fminv_v4f32(<4 x float> %a) { define float @fminv_v8f32(ptr %a) { ; CHECK-LABEL: fminv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fminnm z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fminnmv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -517,8 +517,8 @@ define double @fminv_v2f64(<2 x double> %a) { define double @fminv_v4f64(ptr %a) { ; CHECK-LABEL: fminv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fminnm z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fminnmv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -559,8 +559,8 @@ define half @fmaximumv_v8f16(<8 x half> %a) { define half @fmaximumv_v16f16(ptr %a) { ; CHECK-LABEL: fmaximumv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fmaxv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -597,8 +597,8 @@ define float @fmaximumv_v4f32(<4 x float> %a) { define float @fmaximumv_v8f32(ptr %a) { ; CHECK-LABEL: fmaximumv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fmaxv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -633,8 +633,8 @@ define double @fmaximumv_v2f64(<2 x double> %a) { define double @fmaximumv_v4f64(ptr %a) { ; CHECK-LABEL: fmaximumv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fmaxv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -675,8 +675,8 @@ define half @fminimumv_v8f16(<8 x half> %a) { define half @fminimumv_v16f16(ptr %a) { ; CHECK-LABEL: fminimumv_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fmin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: fminv h0, p0, z0.h ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $z0 @@ -713,8 +713,8 @@ define float @fminimumv_v4f32(<4 x float> %a) { define float @fminimumv_v8f32(ptr %a) { ; CHECK-LABEL: fminimumv_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: fminv s0, p0, z0.s ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $z0 @@ -749,8 +749,8 @@ define double @fminimumv_v2f64(<2 x double> %a) { define double @fminimumv_v4f64(ptr %a) { ; CHECK-LABEL: 
fminimumv_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fmin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: fminv d0, p0, z0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll index 580b43531070f..24832d807c649 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-rounding.ll @@ -47,8 +47,8 @@ define <8 x half> @frintp_v8f16(<8 x half> %op) { define void @frintp_v16f16(ptr %a) { ; CHECK-LABEL: frintp_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintp z0.h, p0/m, z0.h ; CHECK-NEXT: frintp z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -86,8 +86,8 @@ define <4 x float> @frintp_v4f32(<4 x float> %op) { define void @frintp_v8f32(ptr %a) { ; CHECK-LABEL: frintp_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintp z0.s, p0/m, z0.s ; CHECK-NEXT: frintp z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -123,8 +123,8 @@ define <2 x double> @frintp_v2f64(<2 x double> %op) { define void @frintp_v4f64(ptr %a) { ; CHECK-LABEL: frintp_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintp z0.d, p0/m, z0.d ; CHECK-NEXT: frintp z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -178,8 +178,8 @@ define <8 x half> @frintm_v8f16(<8 x half> %op) { define void @frintm_v16f16(ptr %a) { ; CHECK-LABEL: frintm_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintm z0.h, p0/m, z0.h ; CHECK-NEXT: frintm z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -217,8 +217,8 @@ define <4 x float> @frintm_v4f32(<4 x float> %op) { define void @frintm_v8f32(ptr %a) { ; CHECK-LABEL: frintm_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintm z0.s, p0/m, z0.s ; CHECK-NEXT: frintm z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -254,8 +254,8 @@ define <2 x double> @frintm_v2f64(<2 x double> %op) { define void @frintm_v4f64(ptr %a) { ; CHECK-LABEL: frintm_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintm z0.d, p0/m, z0.d ; CHECK-NEXT: frintm z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -309,8 +309,8 @@ define <8 x half> @frinti_v8f16(<8 x half> %op) { define void @frinti_v16f16(ptr %a) { ; CHECK-LABEL: frinti_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frinti z0.h, p0/m, z0.h ; CHECK-NEXT: frinti z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -348,8 +348,8 @@ define <4 x float> @frinti_v4f32(<4 x float> %op) { define void @frinti_v8f32(ptr %a) { ; CHECK-LABEL: frinti_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frinti z0.s, p0/m, z0.s ; CHECK-NEXT: frinti z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -385,8 +385,8 @@ define <2 x double> @frinti_v2f64(<2 x double> %op) { define void 
@frinti_v4f64(ptr %a) { ; CHECK-LABEL: frinti_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frinti z0.d, p0/m, z0.d ; CHECK-NEXT: frinti z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -440,8 +440,8 @@ define <8 x half> @frintx_v8f16(<8 x half> %op) { define void @frintx_v16f16(ptr %a) { ; CHECK-LABEL: frintx_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintx z0.h, p0/m, z0.h ; CHECK-NEXT: frintx z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -479,8 +479,8 @@ define <4 x float> @frintx_v4f32(<4 x float> %op) { define void @frintx_v8f32(ptr %a) { ; CHECK-LABEL: frintx_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintx z0.s, p0/m, z0.s ; CHECK-NEXT: frintx z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -516,8 +516,8 @@ define <2 x double> @frintx_v2f64(<2 x double> %op) { define void @frintx_v4f64(ptr %a) { ; CHECK-LABEL: frintx_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintx z0.d, p0/m, z0.d ; CHECK-NEXT: frintx z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -571,8 +571,8 @@ define <8 x half> @frinta_v8f16(<8 x half> %op) { define void @frinta_v16f16(ptr %a) { ; CHECK-LABEL: frinta_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frinta z0.h, p0/m, z0.h ; CHECK-NEXT: frinta z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -610,8 +610,8 @@ define <4 x float> @frinta_v4f32(<4 x float> %op) { define void @frinta_v8f32(ptr %a) { ; CHECK-LABEL: frinta_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frinta z0.s, p0/m, z0.s ; CHECK-NEXT: frinta z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -647,8 +647,8 @@ define <2 x double> @frinta_v2f64(<2 x double> %op) { define void @frinta_v4f64(ptr %a) { ; CHECK-LABEL: frinta_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frinta z0.d, p0/m, z0.d ; CHECK-NEXT: frinta z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -702,8 +702,8 @@ define <8 x half> @frintn_v8f16(<8 x half> %op) { define void @frintn_v16f16(ptr %a) { ; CHECK-LABEL: frintn_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintn z0.h, p0/m, z0.h ; CHECK-NEXT: frintn z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -741,8 +741,8 @@ define <4 x float> @frintn_v4f32(<4 x float> %op) { define void @frintn_v8f32(ptr %a) { ; CHECK-LABEL: frintn_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintn z0.s, p0/m, z0.s ; CHECK-NEXT: frintn z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -778,8 +778,8 @@ define <2 x double> @frintn_v2f64(<2 x double> %op) { define void @frintn_v4f64(ptr %a) { ; CHECK-LABEL: frintn_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintn z0.d, p0/m, z0.d ; CHECK-NEXT: frintn z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -833,8 +833,8 @@ define <8 x half> @frintz_v8f16(<8 x half> %op) { 
define void @frintz_v16f16(ptr %a) { ; CHECK-LABEL: frintz_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: frintz z0.h, p0/m, z0.h ; CHECK-NEXT: frintz z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -872,8 +872,8 @@ define <4 x float> @frintz_v4f32(<4 x float> %op) { define void @frintz_v8f32(ptr %a) { ; CHECK-LABEL: frintz_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: frintz z0.s, p0/m, z0.s ; CHECK-NEXT: frintz z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -909,8 +909,8 @@ define <2 x double> @frintz_v2f64(<2 x double> %op) { define void @frintz_v4f64(ptr %a) { ; CHECK-LABEL: frintz_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: frintz z0.d, p0/m, z0.d ; CHECK-NEXT: frintz z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll index 73fd7e1465343..132225546fc4f 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-select.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -23,8 +23,8 @@ define <2 x half> @select_v2f16(<2 x half> %op1, <2 x half> %op2, i1 %mask) { define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -39,8 +39,8 @@ define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) { define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { ; CHECK-LABEL: select_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -55,8 +55,8 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) { define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z0.h, w2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -77,8 +77,8 @@ define void @select_v16f16(ptr %a, ptr %b, i1 %mask) { define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -93,8 +93,8 @@ define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) { define <4 x float> @select_v4f32(<4 
x float> %op1, <4 x float> %op2, i1 %mask) { ; CHECK-LABEL: select_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -109,8 +109,8 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) { define void @select_v8f32(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -150,9 +150,9 @@ define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) { ; CHECK-LABEL: select_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -167,9 +167,9 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask define void @select_v4f64(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll index d6adf9cf0ad67..58eae212d7999 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-to-int.ll @@ -36,8 +36,8 @@ define void @fcvtzu_v8f16_v8i16(ptr %a, ptr %b) { define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzu z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -55,8 +55,8 @@ define void @fcvtzu_v16f16_v16i16(ptr %a, ptr %b) { define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { ; CHECK-LABEL: fcvtzu_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -68,8 +68,8 @@ define <2 x i32> @fcvtzu_v2f16_v2i32(<2 x half> %op1) { define <4 x i32> @fcvtzu_v4f16_v4i32(<4 x half> %op1) { ; CHECK-LABEL: fcvtzu_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -302,8 +302,8 @@ define <4 x i16> @fcvtzu_v4f32_v4i16(<4 x float> %op1) { define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { ; CHECK-LABEL: fcvtzu_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ 
-320,8 +320,8 @@ define <8 x i16> @fcvtzu_v8f32_v8i16(ptr %a) { define void @fcvtzu_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v16f32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s @@ -373,8 +373,8 @@ define <4 x i32> @fcvtzu_v4f32_v4i32(<4 x float> %op1) { define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -392,8 +392,8 @@ define void @fcvtzu_v8f32_v8i32(ptr %a, ptr %b) { define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { ; CHECK-LABEL: fcvtzu_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -405,8 +405,8 @@ define <1 x i64> @fcvtzu_v1f32_v1i64(<1 x float> %op1) { define <2 x i64> @fcvtzu_v2f32_v2i64(<2 x float> %op1) { ; CHECK-LABEL: fcvtzu_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -491,8 +491,8 @@ define <4 x i16> @fcvtzu_v4f64_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -520,8 +520,8 @@ define <8 x i16> @fcvtzu_v8f64_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d @@ -563,24 +563,22 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldr q6, [x0, #112] -; CHECK-NEXT: ldp q4, q5, [x0, #80] -; CHECK-NEXT: ldr q7, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z16.s, z1.s[1] @@ -606,25 +604,26 @@ define void @fcvtzu_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.s, z5.s[1] ; 
CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z6.s[1] +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #20] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: ldp q1, q0, [sp] ; CHECK-NEXT: stp q1, q0, [x1] @@ -669,8 +668,8 @@ define <2 x i32> @fcvtzu_v2f64_v2i32(<2 x double> %op1) { define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { ; CHECK-LABEL: fcvtzu_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -687,8 +686,8 @@ define <4 x i32> @fcvtzu_v4f64_v4i32(ptr %a) { define void @fcvtzu_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d @@ -740,8 +739,8 @@ define <2 x i64> @fcvtzu_v2f64_v2i64(<2 x double> %op1) { define void @fcvtzu_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzu_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] @@ -785,8 +784,8 @@ define void @fcvtzs_v8f16_v8i16(ptr %a, ptr %b) { define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f16_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h ; CHECK-NEXT: fcvtzs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -804,8 +803,8 @@ define void @fcvtzs_v16f16_v16i16(ptr %a, ptr %b) { define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { ; CHECK-LABEL: fcvtzs_v2f16_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -817,8 +816,8 @@ define <2 x i32> @fcvtzs_v2f16_v2i32(<2 x half> %op1) { define <4 x i32> @fcvtzs_v4f16_v4i32(<4 x half> %op1) { ; CHECK-LABEL: fcvtzs_v4f16_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1052,8 +1051,8 @@ define <4 x i16> @fcvtzs_v4f32_v4i16(<4 x float> %op1) { define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { ; 
CHECK-LABEL: fcvtzs_v8f32_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -1070,8 +1069,8 @@ define <8 x i16> @fcvtzs_v8f32_v8i16(ptr %a) { define void @fcvtzs_v16f32_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v16f32_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s @@ -1123,8 +1122,8 @@ define <4 x i32> @fcvtzs_v4f32_v4i32(<4 x float> %op1) { define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v8f32_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s ; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -1142,8 +1141,8 @@ define void @fcvtzs_v8f32_v8i32(ptr %a, ptr %b) { define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { ; CHECK-LABEL: fcvtzs_v1f32_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -1155,8 +1154,8 @@ define <1 x i64> @fcvtzs_v1f32_v1i64(<1 x float> %op1) { define <2 x i64> @fcvtzs_v2f32_v2i64(<2 x float> %op1) { ; CHECK-LABEL: fcvtzs_v2f32_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1243,8 +1242,8 @@ define <4 x i16> @fcvtzs_v4f64_v4i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1272,8 +1271,8 @@ define <8 x i16> @fcvtzs_v8f64_v8i16(ptr %a) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d @@ -1315,24 +1314,22 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #32 ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q3, q2, [x0] -; CHECK-NEXT: ldr q6, [x0, #112] -; CHECK-NEXT: ldp q4, q5, [x0, #80] -; CHECK-NEXT: ldr q7, [x0, #64] +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d -; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d +; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d +; CHECK-NEXT: ldp q6, q7, [x0, #64] ; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d -; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d ; CHECK-NEXT: fcvtzs z5.d, p0/m, z5.d ; CHECK-NEXT: fcvtzs z4.d, p0/m, z4.d ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: fcvtzs z6.d, p0/m, z6.d +; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; 
CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s ; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z16.s, z1.s[1] @@ -1358,25 +1355,26 @@ define void @fcvtzs_v16f64_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z3.s, z5.s[1] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: mov z2.s, z6.s[1] +; CHECK-NEXT: uzp1 z2.s, z6.s, z6.s ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s6 -; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: strh w8, [sp, #24] +; CHECK-NEXT: strh w8, [sp, #28] ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: mov z1.s, z1.s[1] -; CHECK-NEXT: strh w8, [sp, #20] +; CHECK-NEXT: strh w8, [sp, #24] ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: mov z0.s, z0.s[1] -; CHECK-NEXT: strh w8, [sp, #16] +; CHECK-NEXT: strh w8, [sp, #20] ; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: strh w8, [sp, #30] +; CHECK-NEXT: mov z2.s, z2.s[1] +; CHECK-NEXT: strh w8, [sp, #16] ; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: strh w8, [sp, #26] +; CHECK-NEXT: strh w8, [sp, #30] ; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: strh w8, [sp, #22] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w8, [sp, #18] ; CHECK-NEXT: ldp q1, q0, [sp] ; CHECK-NEXT: stp q1, q0, [x1] @@ -1421,8 +1419,8 @@ define <2 x i32> @fcvtzs_v2f64_v2i32(<2 x double> %op1) { define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { ; CHECK-LABEL: fcvtzs_v4f64_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -1439,8 +1437,8 @@ define <4 x i32> @fcvtzs_v4f64_v4i32(ptr %a) { define void @fcvtzs_v8f64_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v8f64_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d @@ -1492,8 +1490,8 @@ define <2 x i64> @fcvtzs_v2f64_v2i64(<2 x double> %op1) { define void @fcvtzs_v4f64_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: fcvtzs_v4f64_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d ; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll index ee8704284def5..4c5a6fe2fd231 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll @@ -71,8 +71,8 @@ define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask define void @select_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: fcmeq p0.h, p0/z, z2.h, z3.h @@ -128,8 +128,8 @@ define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %m define void @select_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q3, 
[x1] ; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: fcmeq p0.s, p0/z, z2.s, z3.s @@ -186,8 +186,8 @@ define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> define void @select_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: fcmeq p0.d, p0/z, z2.d, z3.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll index 0b3e7695e6a0a..4aa965777c742 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-insert-vector-elt.ll @@ -11,9 +11,9 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { ; CHECK-LABEL: insertelement_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -28,9 +28,9 @@ define <4 x i8> @insertelement_v4i8(<4 x i8> %op1) { define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { ; CHECK-LABEL: insertelement_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -45,9 +45,9 @@ define <8 x i8> @insertelement_v8i8(<8 x i8> %op1) { define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { ; CHECK-LABEL: insertelement_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z1.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -62,9 +62,9 @@ define <16 x i8> @insertelement_v16i8(<16 x i8> %op1) { define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { ; CHECK-LABEL: insertelement_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov w8, #15 // =0xf ; CHECK-NEXT: index z2.b, #0, #1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z3.b, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -80,9 +80,9 @@ define <32 x i8> @insertelement_v32i8(<32 x i8> %op1) { define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { ; CHECK-LABEL: insertelement_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -97,9 +97,9 @@ define <2 x i16> @insertelement_v2i16(<2 x i16> %op1) { define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { ; CHECK-LABEL: insertelement_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -114,9 +114,9 @@ define <4 x i16> @insertelement_v4i16(<4 x i16> %op1) { define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { ; CHECK-LABEL: insertelement_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; 
CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -131,9 +131,9 @@ define <8 x i16> @insertelement_v8i16(<8 x i16> %op1) { define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { ; CHECK-LABEL: insertelement_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z2.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z3.h, w8 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -149,9 +149,9 @@ define <16 x i16> @insertelement_v16i16(<16 x i16> %op1) { define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { ; CHECK-LABEL: insertelement_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -166,9 +166,9 @@ define <2 x i32> @insertelement_v2i32(<2 x i32> %op1) { define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { ; CHECK-LABEL: insertelement_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -183,9 +183,9 @@ define <4 x i32> @insertelement_v4i32(<4 x i32> %op1) { define <8 x i32> @insertelement_v8i32(ptr %a) { ; CHECK-LABEL: insertelement_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s @@ -212,9 +212,9 @@ define <1 x i64> @insertelement_v1i64(<1 x i64> %op1) { define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { ; CHECK-LABEL: insertelement_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: mov w8, #5 // =0x5 @@ -229,9 +229,9 @@ define <2 x i64> @insertelement_v2i64(<2 x i64> %op1) { define <4 x i64> @insertelement_v4i64(ptr %a) { ; CHECK-LABEL: insertelement_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: mov w8, #5 // =0x5 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d @@ -264,9 +264,9 @@ define <2 x half> @insertelement_v2f16(<2 x half> %op1) { define <4 x half> @insertelement_v4f16(<4 x half> %op1) { ; CHECK-LABEL: insertelement_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h @@ -281,9 +281,9 @@ define <4 x half> @insertelement_v4f16(<4 x half> %op1) { define <8 x half> @insertelement_v8f16(<8 x half> %op1) { ; CHECK-LABEL: insertelement_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z1.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.h, p0/z, z1.h, z2.h @@ -298,9 +298,9 @@ define <8 
x half> @insertelement_v8f16(<8 x half> %op1) { define <16 x half> @insertelement_v16f16(ptr %a) { ; CHECK-LABEL: insertelement_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov w8, #7 // =0x7 ; CHECK-NEXT: index z0.h, #0, #1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z1.h, w8 ; CHECK-NEXT: fmov h2, #5.00000000 ; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, z1.h @@ -317,9 +317,9 @@ define <16 x half> @insertelement_v16f16(ptr %a) { define <2 x float> @insertelement_v2f32(<2 x float> %op1) { ; CHECK-LABEL: insertelement_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s @@ -334,9 +334,9 @@ define <2 x float> @insertelement_v2f32(<2 x float> %op1) { define <4 x float> @insertelement_v4f32(<4 x float> %op1) { ; CHECK-LABEL: insertelement_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z1.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z2.s, w8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.s, p0/z, z1.s, z2.s @@ -351,9 +351,9 @@ define <4 x float> @insertelement_v4f32(<4 x float> %op1) { define <8 x float> @insertelement_v8f32(ptr %a) { ; CHECK-LABEL: insertelement_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov w8, #3 // =0x3 ; CHECK-NEXT: index z0.s, #0, #1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z1.s, w8 ; CHECK-NEXT: fmov s2, #5.00000000 ; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s @@ -379,9 +379,9 @@ define <1 x double> @insertelement_v1f64(<1 x double> %op1) { define <2 x double> @insertelement_v2f64(<2 x double> %op1) { ; CHECK-LABEL: insertelement_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z1.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z2.d, x8 ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: cmpeq p0.d, p0/z, z1.d, z2.d @@ -396,9 +396,9 @@ define <2 x double> @insertelement_v2f64(<2 x double> %op1) { define <4 x double> @insertelement_v4f64(ptr %a) { ; CHECK-LABEL: insertelement_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.d, x8 ; CHECK-NEXT: fmov d2, #5.00000000 ; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll index e3c4b6f1cb53f..8baa87c6d686d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-arith.ll @@ -262,8 +262,8 @@ define <16 x i8> @mul_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @mul_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: mul_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -352,8 +352,8 @@ define <8 x i16> @mul_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @mul_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: mul_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -421,8 
+421,8 @@ define <4 x i32> @mul_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @mul_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: mul_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -490,8 +490,8 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @mul_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: mul_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 @@ -746,8 +746,8 @@ define <16 x i8> @abs_v16i8(<16 x i8> %op1) { define void @abs_v32i8(ptr %a) { ; CHECK-LABEL: abs_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: abs z0.b, p0/m, z0.b ; CHECK-NEXT: abs z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -798,8 +798,8 @@ define <8 x i16> @abs_v8i16(<8 x i16> %op1) { define void @abs_v16i16(ptr %a) { ; CHECK-LABEL: abs_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: abs z0.h, p0/m, z0.h ; CHECK-NEXT: abs z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -837,8 +837,8 @@ define <4 x i32> @abs_v4i32(<4 x i32> %op1) { define void @abs_v8i32(ptr %a) { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -876,8 +876,8 @@ define <2 x i64> @abs_v2i64(<2 x i64> %op1) { define void @abs_v4i64(ptr %a) { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll index 6200e44218a96..73c1eac99dd30 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-compares.ll @@ -41,8 +41,8 @@ define <16 x i8> @icmp_eq_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @icmp_eq_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -91,8 +91,8 @@ define <8 x i16> @icmp_eq_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @icmp_eq_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h @@ -141,8 +141,8 @@ define <4 x i32> @icmp_eq_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @icmp_eq_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.s, p0/z, z1.s, z0.s ; CHECK-NEXT: cmpeq 
p0.s, p0/z, z2.s, z3.s @@ -191,8 +191,8 @@ define <2 x i64> @icmp_eq_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @icmp_eq_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_eq_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpeq p1.d, p0/z, z1.d, z0.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d @@ -215,8 +215,8 @@ define void @icmp_eq_v4i64(ptr %a, ptr %b) { define void @icmp_ne_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_ne_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpne p1.b, p0/z, z1.b, z0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, z3.b @@ -261,8 +261,8 @@ define void @icmp_sge_v8i16(ptr %a, ptr %b) { define void @icmp_sgt_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_sgt_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpgt p1.h, p0/z, z1.h, z0.h ; CHECK-NEXT: cmpgt p0.h, p0/z, z2.h, z3.h @@ -307,8 +307,8 @@ define void @icmp_sle_v4i32(ptr %a, ptr %b) { define void @icmp_slt_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: icmp_slt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: cmpgt p0.s, p0/z, z3.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll index fcf4f21c6ea84..5158dda37a8b9 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-div.ll @@ -86,8 +86,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b @@ -163,18 +163,18 @@ define void @sdiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: sdiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret @@ -203,9 +203,9 @@ define <2 x i16> @sdiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { define <4 x i16> @sdiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: sdiv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z1.s, z1.h ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z1.s @@ 
-272,8 +272,8 @@ define void @sdiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] @@ -314,8 +314,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @sdiv_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sdivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -358,8 +358,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @sdiv_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: sdivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -453,8 +453,8 @@ define <16 x i8> @udiv_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h ; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b @@ -530,18 +530,18 @@ define void @udiv_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h ; CHECK-NEXT: udiv z6.s, p0/m, z6.s, z7.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h ; CHECK-NEXT: uzp1 z1.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b +; CHECK-NEXT: uzp1 z3.b, z7.b, z7.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b ; CHECK-NEXT: stp q3, q2, [x0] ; CHECK-NEXT: ret @@ -555,9 +555,9 @@ define void @udiv_v32i8(ptr %a, ptr %b) { define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: udiv_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -570,9 +570,9 @@ define <2 x i16> @udiv_v2i16(<2 x i16> %op1, <2 x i16> %op2) { define <4 x i16> @udiv_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: udiv_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z1.s, z1.h ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: udiv z0.s, p0/m, z0.s, z1.s @@ -639,8 +639,8 @@ define void @udiv_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udiv z3.s, p0/m, z3.s, z4.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h +; CHECK-NEXT: uzp1 z1.h, z5.h, z5.h ; 
CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z1.h ; CHECK-NEXT: stp q3, q0, [x0] @@ -681,8 +681,8 @@ define <4 x i32> @udiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @udiv_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: udivr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -725,8 +725,8 @@ define <2 x i64> @udiv_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @udiv_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: udiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: udivr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -743,10 +743,10 @@ define void @udiv_v4i64(ptr %a, ptr %b) { define void @udiv_constantsplat_v8i32(ptr %a) { ; SVE-LABEL: udiv_constantsplat_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: mov w8, #8969 // =0x2309 -; SVE-NEXT: movk w8, #22765, lsl #16 ; SVE-NEXT: ldp q1, q2, [x0] +; SVE-NEXT: movk w8, #22765, lsl #16 +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: mov z0.s, w8 ; SVE-NEXT: movprfx z3, z1 ; SVE-NEXT: umulh z3.s, p0/m, z3.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll index 0785c67ce6f41..f028b3eeca257 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-immediates.ll @@ -221,8 +221,8 @@ define void @ashr_v4i64(ptr %a) { define void @icmp_eq_v32i8(ptr %a) { ; CHECK-LABEL: icmp_eq_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, #7 ; CHECK-NEXT: cmpeq p0.b, p0/z, z1.b, #7 ; CHECK-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff @@ -241,8 +241,8 @@ define void @icmp_eq_v32i8(ptr %a) { define void @icmp_sge_v16i16(ptr %a) { ; CHECK-LABEL: icmp_sge_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: cmpge p1.h, p0/z, z0.h, #15 ; CHECK-NEXT: cmpge p0.h, p0/z, z1.h, #15 ; CHECK-NEXT: mov z0.h, p1/z, #-1 // =0xffffffffffffffff @@ -261,8 +261,8 @@ define void @icmp_sge_v16i16(ptr %a) { define void @icmp_sgt_v8i32(ptr %a) { ; CHECK-LABEL: icmp_sgt_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: cmpgt p1.s, p0/z, z0.s, #-8 ; CHECK-NEXT: cmpgt p0.s, p0/z, z1.s, #-8 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff @@ -281,8 +281,8 @@ define void @icmp_sgt_v8i32(ptr %a) { define void @icmp_ult_v4i64(ptr %a) { ; CHECK-LABEL: icmp_ult_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: cmplo p1.d, p0/z, z0.d, #63 ; CHECK-NEXT: cmplo p0.d, p0/z, z1.d, #63 ; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll index d7600c6e6192d..50cf9b73d9a79 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-minmax.ll @@ 
-37,8 +37,8 @@ define <16 x i8> @smax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smax_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -81,8 +81,8 @@ define <8 x i16> @smax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smax_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -125,8 +125,8 @@ define <4 x i32> @smax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smax_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -171,8 +171,8 @@ define <2 x i64> @smax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smax_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: smax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -219,8 +219,8 @@ define <16 x i8> @smin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smin_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -263,8 +263,8 @@ define <8 x i16> @smin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smin_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -307,8 +307,8 @@ define <4 x i32> @smin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smin_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -353,8 +353,8 @@ define <2 x i64> @smin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smin_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: smin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -401,8 +401,8 @@ define <16 x i8> @umax_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umax_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -445,8 +445,8 @@ define <8 x i16> @umax_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umax_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue 
p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -489,8 +489,8 @@ define <4 x i32> @umax_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umax_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -535,8 +535,8 @@ define <2 x i64> @umax_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umax_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: umax_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -583,8 +583,8 @@ define <16 x i8> @umin_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umin_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -627,8 +627,8 @@ define <8 x i16> @umin_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umin_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -671,8 +671,8 @@ define <4 x i32> @umin_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umin_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -717,8 +717,8 @@ define <2 x i64> @umin_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umin_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: umin_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll index c48cb315a7aa3..cb7fa53eac513 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll @@ -101,8 +101,8 @@ define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @smulh_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -214,8 +214,8 @@ define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @smulh_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -295,8 +295,8 @@ define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @smulh_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v8i32: ; SVE: // %bb.0: -; 
SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -378,8 +378,8 @@ define <2 x i64> @smulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @smulh_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: smulh_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: smulh z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 @@ -413,9 +413,9 @@ define void @smulh_v4i64(ptr %a, ptr %b) { define <4 x i8> @umulh_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; SVE-LABEL: umulh_v4i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.h, vl4 ; SVE-NEXT: and z0.h, z0.h, #0xff ; SVE-NEXT: and z1.h, z1.h, #0xff ; SVE-NEXT: mul z0.h, p0/m, z0.h, z1.h @@ -494,8 +494,8 @@ define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @umulh_v32i8(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v32i8: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.b, vl16 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.b, p0/m, z0.b, z1.b ; SVE-NEXT: movprfx z1, z2 @@ -525,9 +525,9 @@ define void @umulh_v32i8(ptr %a, ptr %b) { define <2 x i16> @umulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; SVE-LABEL: umulh_v2i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: // kill: def $d1 killed $d1 def $z1 ; SVE-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE-NEXT: ptrue p0.s, vl2 ; SVE-NEXT: and z0.s, z0.s, #0xffff ; SVE-NEXT: and z1.s, z1.s, #0xffff ; SVE-NEXT: mul z0.s, p0/m, z0.s, z1.s @@ -606,8 +606,8 @@ define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @umulh_v16i16(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v16i16: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.h, vl8 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.h, p0/m, z0.h, z1.h ; SVE-NEXT: movprfx z1, z2 @@ -687,8 +687,8 @@ define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @umulh_v8i32(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v8i32: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.s, vl4 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.s, p0/m, z0.s, z1.s ; SVE-NEXT: movprfx z1, z2 @@ -770,8 +770,8 @@ define <2 x i64> @umulh_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @umulh_v4i64(ptr %a, ptr %b) { ; SVE-LABEL: umulh_v4i64: ; SVE: // %bb.0: -; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q0, q3, [x1] +; SVE-NEXT: ptrue p0.d, vl2 ; SVE-NEXT: ldp q1, q2, [x0] ; SVE-NEXT: umulh z0.d, p0/m, z0.d, z1.d ; SVE-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll index c51630ecd752a..751f43768a511 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-reduce.ll @@ -37,8 +37,8 @@ define i8 @uaddv_v16i8(<16 x i8> %a) { define i8 @uaddv_v32i8(ptr %a) { ; CHECK-LABEL: uaddv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: add z0.b, z1.b, z0.b ; CHECK-NEXT: uaddv d0, p0, z0.b ; CHECK-NEXT: fmov x0, d0 @@ -78,8 +78,8 @@ define i16 @uaddv_v8i16(<8 x i16> %a) 
{ define i16 @uaddv_v16i16(ptr %a) { ; CHECK-LABEL: uaddv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: add z0.h, z1.h, z0.h ; CHECK-NEXT: uaddv d0, p0, z0.h ; CHECK-NEXT: fmov x0, d0 @@ -119,8 +119,8 @@ define i32 @uaddv_v4i32(<4 x i32> %a) { define i32 @uaddv_v8i32(ptr %a) { ; CHECK-LABEL: uaddv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: add z0.s, z1.s, z0.s ; CHECK-NEXT: uaddv d0, p0, z0.s ; CHECK-NEXT: fmov x0, d0 @@ -146,8 +146,8 @@ define i64 @uaddv_v2i64(<2 x i64> %a) { define i64 @uaddv_v4i64(ptr %a) { ; CHECK-LABEL: uaddv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: add z0.d, z1.d, z0.d ; CHECK-NEXT: uaddv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -188,8 +188,8 @@ define i8 @smaxv_v16i8(<16 x i8> %a) { define i8 @smaxv_v32i8(ptr %a) { ; CHECK-LABEL: smaxv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: smaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -226,8 +226,8 @@ define i16 @smaxv_v8i16(<8 x i16> %a) { define i16 @smaxv_v16i16(ptr %a) { ; CHECK-LABEL: smaxv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: smaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -264,8 +264,8 @@ define i32 @smaxv_v4i32(<4 x i32> %a) { define i32 @smaxv_v8i32(ptr %a) { ; CHECK-LABEL: smaxv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: smaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -291,8 +291,8 @@ define i64 @smaxv_v2i64(<2 x i64> %a) { define i64 @smaxv_v4i64(ptr %a) { ; CHECK-LABEL: smaxv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: smax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: smaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -333,8 +333,8 @@ define i8 @sminv_v16i8(<16 x i8> %a) { define i8 @sminv_v32i8(ptr %a) { ; CHECK-LABEL: sminv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: smin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: sminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -371,8 +371,8 @@ define i16 @sminv_v8i16(<8 x i16> %a) { define i16 @sminv_v16i16(ptr %a) { ; CHECK-LABEL: sminv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: smin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: sminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -409,8 +409,8 @@ define i32 @sminv_v4i32(<4 x i32> %a) { define i32 @sminv_v8i32(ptr %a) { ; CHECK-LABEL: sminv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: smin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: sminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -436,8 +436,8 @@ define i64 @sminv_v2i64(<2 x i64> %a) { define i64 @sminv_v4i64(ptr %a) { ; CHECK-LABEL: sminv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: smin z0.d, p0/m, z0.d, z1.d ; 
CHECK-NEXT: sminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -478,8 +478,8 @@ define i8 @umaxv_v16i8(<16 x i8> %a) { define i8 @umaxv_v32i8(ptr %a) { ; CHECK-LABEL: umaxv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umax z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: umaxv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -516,8 +516,8 @@ define i16 @umaxv_v8i16(<8 x i16> %a) { define i16 @umaxv_v16i16(ptr %a) { ; CHECK-LABEL: umaxv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umax z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: umaxv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -554,8 +554,8 @@ define i32 @umaxv_v4i32(<4 x i32> %a) { define i32 @umaxv_v8i32(ptr %a) { ; CHECK-LABEL: umaxv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umax z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: umaxv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -581,8 +581,8 @@ define i64 @umaxv_v2i64(<2 x i64> %a) { define i64 @umaxv_v4i64(ptr %a) { ; CHECK-LABEL: umaxv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: umax z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: umaxv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -623,8 +623,8 @@ define i8 @uminv_v16i8(<16 x i8> %a) { define i8 @uminv_v32i8(ptr %a) { ; CHECK-LABEL: uminv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: umin z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: uminv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -661,8 +661,8 @@ define i16 @uminv_v8i16(<8 x i16> %a) { define i16 @uminv_v16i16(ptr %a) { ; CHECK-LABEL: uminv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: umin z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: uminv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -699,8 +699,8 @@ define i32 @uminv_v4i32(<4 x i32> %a) { define i32 @uminv_v8i32(ptr %a) { ; CHECK-LABEL: uminv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: umin z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: uminv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -726,8 +726,8 @@ define i64 @uminv_v2i64(<2 x i64> %a) { define i64 @uminv_v4i64(ptr %a) { ; CHECK-LABEL: uminv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: umin z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: uminv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll index 4a1209b942f4a..d373a9063f852 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-rem.ll @@ -65,7 +65,6 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.h, z2.b @@ -91,15 +90,16 @@ define <16 x i8> @srem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: sdivr z3.s, p0/m, z3.s, 
z5.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p1/m, z3.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <16 x i8> %op1, %op2 @@ -112,7 +112,6 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: sunpklo z7.h, z1.b @@ -171,22 +170,23 @@ define void @srem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: sdivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z6.b, z1.b ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <32 x i8>, ptr %a @@ -199,11 +199,11 @@ define void @srem_v32i8(ptr %a, ptr %b) { define <4 x i16> @srem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: srem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: sunpklo z2.s, z1.h ; CHECK-NEXT: sunpklo z3.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sdivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -223,7 +223,6 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z4.s, z0.h -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: sunpklo z2.s, z2.h @@ -235,7 +234,8 @@ define <8 x i16> @srem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: mls z0.h, p1/m, z3.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = srem <8 x i16> %op1, %op2 @@ -248,7 +248,6 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q4, q1, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z5.d, z4.d @@ -277,9 +276,10 @@ define void @srem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: 
splice z6.h, p0, z6.h, z5.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.h, p1/m, z6.h, z4.h -; CHECK-NEXT: mls z0.h, p1/m, z7.h, z1.h +; CHECK-NEXT: mls z2.h, p0/m, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -322,8 +322,8 @@ define <4 x i32> @srem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @srem_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: sdiv z4.s, p0/m, z4.s, z0.s @@ -374,8 +374,8 @@ define <2 x i64> @srem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @srem_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: srem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: sdiv z4.d, p0/m, z4.d, z0.d @@ -454,7 +454,6 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.h, z2.b @@ -480,15 +479,16 @@ define <16 x i8> @urem_v16i8(<16 x i8> %op1, <16 x i8> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: udivr z3.s, p0/m, z3.s, z5.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: splice z4.h, p0, z4.h, z2.h +; CHECK-NEXT: uzp1 z5.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h ; CHECK-NEXT: ptrue p0.b, vl8 ; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b ; CHECK-NEXT: splice z3.b, p0, z3.b, z2.b -; CHECK-NEXT: mls z0.b, p1/m, z3.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z3.b, z1.b ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <16 x i8> %op1, %op2 @@ -501,7 +501,6 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q0, [x0, #16] ; CHECK-NEXT: ldr q1, [x1, #16] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: uunpklo z7.h, z1.b @@ -560,22 +559,23 @@ define void @urem_v32i8(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h ; CHECK-NEXT: udivr z18.s, p0/m, z18.s, z20.s ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z5.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z19.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z5.b, z16.b, z16.b ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b ; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: splice z19.h, p0, z19.h, z18.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z6.b, p0, z6.b, z2.b +; CHECK-NEXT: uzp1 z7.b, z19.b, z19.b ; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b -; CHECK-NEXT: mls z0.b, p1/m, z6.b, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: mls z0.b, p0/m, z6.b, z1.b ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.b, p1/m, z7.b, z4.b +; CHECK-NEXT: mls z2.b, p0/m, z7.b, z4.b ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load 
<32 x i8>, ptr %a @@ -588,11 +588,11 @@ define void @urem_v32i8(ptr %a, ptr %b) { define <4 x i16> @urem_v4i16(<4 x i16> %op1, <4 x i16> %op2) { ; CHECK-LABEL: urem_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: uunpklo z2.s, z1.h ; CHECK-NEXT: uunpklo z3.s, z0.h +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: udivr z2.s, p0/m, z2.s, z3.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -612,7 +612,6 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z4.s, z0.h -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: ext z2.b, z2.b, z1.b, #8 ; CHECK-NEXT: ext z3.b, z3.b, z0.b, #8 ; CHECK-NEXT: uunpklo z2.s, z2.h @@ -624,7 +623,8 @@ define <8 x i16> @urem_v8i16(<8 x i16> %op1, <8 x i16> %op2) { ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h -; CHECK-NEXT: mls z0.h, p1/m, z3.h, z1.h +; CHECK-NEXT: ptrue p0.h, vl8 +; CHECK-NEXT: mls z0.h, p0/m, z3.h, z1.h ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 ; CHECK-NEXT: ret %res = urem <8 x i16> %op1, %op2 @@ -637,7 +637,6 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldp q4, q1, [x1] ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0, #16] -; CHECK-NEXT: ptrue p1.h, vl8 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: mov z5.d, z4.d @@ -666,9 +665,10 @@ define void @urem_v16i16(ptr %a, ptr %b) { ; CHECK-NEXT: splice z6.h, p0, z6.h, z5.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h ; CHECK-NEXT: splice z7.h, p0, z7.h, z2.h +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: movprfx z2, z3 -; CHECK-NEXT: mls z2.h, p1/m, z6.h, z4.h -; CHECK-NEXT: mls z0.h, p1/m, z7.h, z1.h +; CHECK-NEXT: mls z2.h, p0/m, z6.h, z4.h +; CHECK-NEXT: mls z0.h, p0/m, z7.h, z1.h ; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret %op1 = load <16 x i16>, ptr %a @@ -711,8 +711,8 @@ define <4 x i32> @urem_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @urem_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: udiv z4.s, p0/m, z4.s, z0.s @@ -763,8 +763,8 @@ define <2 x i64> @urem_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @urem_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: urem_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: movprfx z4, z1 ; CHECK-NEXT: udiv z4.d, p0/m, z4.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll index 3b58e35bd844c..906112f7ac39e 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-select.ll @@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i8> @select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -23,8 +23,8 @@ define <4 x i8> 
@select_v4i8(<4 x i8> %op1, <4 x i8> %op2, i1 %mask) { define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 @@ -38,8 +38,8 @@ define <8 x i8> @select_v8i8(<8 x i8> %op1, <8 x i8> %op2, i1 %mask) { define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { ; CHECK-LABEL: select_v16i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z2.b, w0 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: cmpne p0.b, p0/z, z2.b, #0 @@ -53,8 +53,8 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, i1 %mask) { define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: mov z0.b, w2 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x0, #16] @@ -74,8 +74,8 @@ define void @select_v32i8(ptr %a, ptr %b, i1 %mask) { define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -90,8 +90,8 @@ define <2 x i16> @select_v2i16(<2 x i16> %op1, <2 x i16> %op2, i1 %mask) { define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -106,8 +106,8 @@ define <4 x i16> @select_v4i16(<4 x i16> %op1, <4 x i16> %op2, i1 %mask) { define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { ; CHECK-LABEL: select_v8i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z2.h, w0 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: and z2.h, z2.h, #0x1 @@ -122,8 +122,8 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, i1 %mask) { define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mov z0.h, w2 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -144,8 +144,8 @@ define void @select_v16i16(ptr %a, ptr %b, i1 %mask) { define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -160,8 +160,8 @@ define <2 x i32> @select_v2i32(<2 x i32> %op1, <2 x i32> %op2, i1 %mask) { define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { ; CHECK-LABEL: select_v4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w0, #0x1 +; CHECK-NEXT: ptrue 
p0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.s, w8 @@ -176,8 +176,8 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, i1 %mask) { define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and w8, w2, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mov z0.s, w8 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ldr q0, [x0] @@ -198,9 +198,9 @@ define void @select_v8i32(ptr %a, ptr %b, i1 %mask) { define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -215,9 +215,9 @@ define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, i1 %mask) { define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { ; CHECK-LABEL: select_v2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0 ; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -232,9 +232,9 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, i1 %mask) { define void @select_v4i64(ptr %a, ptr %b, i1 %mask) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 ; CHECK-NEXT: and x8, x2, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z0.d, x8 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ldr q0, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll index c7fa0e8ad5e4a..9ed52e321d9ab 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-shifts.ll @@ -52,8 +52,8 @@ define <16 x i8> @ashr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @ashr_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -111,8 +111,8 @@ define <8 x i16> @ashr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @ashr_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -155,8 +155,8 @@ define <4 x i32> @ashr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @ashr_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -199,8 +199,8 @@ define <2 x i64> @ashr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @ashr_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: ashr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, 
[x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: asrr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -221,9 +221,9 @@ define void @ashr_v4i64(ptr %a, ptr %b) { define <4 x i8> @lshr_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: lshr_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff ; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h @@ -262,8 +262,8 @@ define <16 x i8> @lshr_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @lshr_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -280,9 +280,9 @@ define void @lshr_v32i8(ptr %a, ptr %b) { define <2 x i16> @lshr_v2i16(<2 x i16> %op1, <2 x i16> %op2) { ; CHECK-LABEL: lshr_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s @@ -321,8 +321,8 @@ define <8 x i16> @lshr_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @lshr_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -365,8 +365,8 @@ define <4 x i32> @lshr_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @lshr_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -409,8 +409,8 @@ define <2 x i64> @lshr_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @lshr_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: lshr_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lsrr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 @@ -431,8 +431,8 @@ define void @lshr_v4i64(ptr %a, ptr %b) { define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { ; CHECK-LABEL: shl_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z1.s, z1.s, #0xff ; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s @@ -445,8 +445,8 @@ define <2 x i8> @shl_v2i8(<2 x i8> %op1, <2 x i8> %op2) { define <4 x i8> @shl_v4i8(<4 x i8> %op1, <4 x i8> %op2) { ; CHECK-LABEL: shl_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h @@ -485,8 +485,8 @@ define <16 x i8> @shl_v16i8(<16 x i8> %op1, <16 x i8> %op2) { define void @shl_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q3, [x1] +; 
CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.b, p0/m, z0.b, z1.b ; CHECK-NEXT: movprfx z1, z2 @@ -529,8 +529,8 @@ define <8 x i16> @shl_v8i16(<8 x i16> %op1, <8 x i16> %op2) { define void @shl_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -573,8 +573,8 @@ define <4 x i32> @shl_v4i32(<4 x i32> %op1, <4 x i32> %op2) { define void @shl_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -617,8 +617,8 @@ define <2 x i64> @shl_v2i64(<2 x i64> %op1, <2 x i64> %op2) { define void @shl_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: shl_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: lslr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index 5c5cf68135bf8..b285659258f31 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -36,8 +36,8 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.h ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -55,8 +55,8 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; CHECK-LABEL: ucvtf_v2i16_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 @@ -68,8 +68,8 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-LABEL: ucvtf_v4i16_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -277,8 +277,8 @@ define <4 x half> @ucvtf_v4i32_v4f16(<4 x i32> %op1) { define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { ; CHECK-LABEL: ucvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -295,8 +295,8 @@ define <8 x half> @ucvtf_v8i32_v8f16(ptr %a) { define void @ucvtf_v16i32_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v16i32_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ucvtf z1.h, p0/m, z1.s ; CHECK-NEXT: ucvtf z0.h, p0/m, z0.s @@ -348,8 +348,8 @@ 
define <4 x float> @ucvtf_v4i32_v4f32(<4 x i32> %op1) { define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.s ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -367,8 +367,8 @@ define void @ucvtf_v8i32_v8f32(ptr %a, ptr %b) { define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-LABEL: ucvtf_v2i32_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: uunpklo z0.d, z0.s ; CHECK-NEXT: ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -446,16 +446,16 @@ define <2 x half> @ucvtf_v2i64_v2f16(<2 x i64> %op1) { define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { ; CHECK-LABEL: ucvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -467,10 +467,9 @@ define <4 x half> @ucvtf_v4i64_v4f16(ptr %a) { define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-LABEL: ucvtf_v8i64_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ptrue p1.s ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z3.s, p0/m, z3.d @@ -482,11 +481,12 @@ define <8 x half> @ucvtf_v8i64_v8f16(ptr %a) { ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s ; CHECK-NEXT: splice z1.s, p0, z1.s, z0.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: movprfx z0, z1 -; CHECK-NEXT: fcvt z0.h, p1/m, z1.s +; CHECK-NEXT: fcvt z0.h, p0/m, z1.s ; CHECK-NEXT: movprfx z1, z2 -; CHECK-NEXT: fcvt z1.h, p1/m, z2.s +; CHECK-NEXT: fcvt z1.h, p0/m, z2.s +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z2.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z0.h, z1.h, z1.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z2.h @@ -517,8 +517,8 @@ define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) { define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { ; CHECK-LABEL: ucvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -535,8 +535,8 @@ define <4 x float> @ucvtf_v4i64_v4f32(ptr %a) { define void @ucvtf_v8i64_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v8i64_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: ucvtf z1.s, p0/m, z1.d ; CHECK-NEXT: ucvtf z0.s, p0/m, z0.d @@ -576,8 +576,8 @@ define <2 x double> @ucvtf_v2i64_v2f64(<2 x i64> %op1) { define void @ucvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: ucvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: 
ucvtf z0.d, p0/m, z0.d ; CHECK-NEXT: ucvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] @@ -621,8 +621,8 @@ define void @scvtf_v8i16_v8f16(ptr %a, ptr %b) { define void @scvtf_v16i16_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: scvtf_v16i16_v16f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: scvtf z0.h, p0/m, z0.h ; CHECK-NEXT: scvtf z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x1] @@ -652,8 +652,8 @@ define <2 x float> @scvtf_v2i16_v2f32(<2 x i16> %op1) { define <4 x float> @scvtf_v4i16_v4f32(<4 x i16> %op1) { ; CHECK-LABEL: scvtf_v4i16_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: sunpklo z0.s, z0.h ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -850,8 +850,8 @@ define <4 x half> @scvtf_v4i32_v4f16(<4 x i32> %op1) { define <8 x half> @scvtf_v8i32_v8f16(ptr %a) { ; CHECK-LABEL: scvtf_v8i32_v8f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: scvtf z1.h, p0/m, z1.s ; CHECK-NEXT: scvtf z0.h, p0/m, z0.s ; CHECK-NEXT: ptrue p0.h, vl4 @@ -896,8 +896,8 @@ define <4 x float> @scvtf_v4i32_v4f32(<4 x i32> %op1) { define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: scvtf_v8i32_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: scvtf z0.s, p0/m, z0.s ; CHECK-NEXT: scvtf z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x1] @@ -915,8 +915,8 @@ define void @scvtf_v8i32_v8f32(ptr %a, ptr %b) { define <2 x double> @scvtf_v2i32_v2f64(<2 x i32> %op1) { ; CHECK-LABEL: scvtf_v2i32_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 @@ -1038,16 +1038,16 @@ define <2 x half> @scvtf_v2i64_v2f16(<2 x i64> %op1) { define <4 x half> @scvtf_v4i64_v4f16(ptr %a) { ; CHECK-LABEL: scvtf_v4i64_v4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.d ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s -; CHECK-NEXT: fcvt z0.h, p1/m, z0.s +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fcvt z0.h, p0/m, z0.s ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0 ; CHECK-NEXT: ret @@ -1076,8 +1076,8 @@ define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) { define <4 x float> @scvtf_v4i64_v4f32(ptr %a) { ; CHECK-LABEL: scvtf_v4i64_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: scvtf z1.s, p0/m, z1.d ; CHECK-NEXT: scvtf z0.s, p0/m, z0.d ; CHECK-NEXT: ptrue p0.s, vl2 @@ -1110,8 +1110,8 @@ define <2 x double> @scvtf_v2i64_v2f64(<2 x i64> %op1) { define void @scvtf_v4i64_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: scvtf_v4i64_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: scvtf z0.d, p0/m, z0.d ; CHECK-NEXT: scvtf z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x1] diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll index 1809cfcf3db69..81bbaa92d4b47 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll @@ -61,8 +61,8 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask) define void @select_v32i8(ptr %a, ptr %b) { ; CHECK-LABEL: select_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.b, p0/z, z0.b, z1.b ; CHECK-NEXT: cmpeq p0.b, p0/z, z2.b, z3.b @@ -136,8 +136,8 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) { define void @select_v16i16(ptr %a, ptr %b) { ; CHECK-LABEL: select_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, z1.h ; CHECK-NEXT: cmpeq p0.h, p0/z, z2.h, z3.h @@ -193,8 +193,8 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) { define void @select_v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: select_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, z1.s ; CHECK-NEXT: cmpeq p0.s, p0/z, z2.s, z3.s @@ -213,9 +213,9 @@ define void @select_v8i32(ptr %a, ptr %b) { define <1 x i64> @select_v1i64(<1 x i64> %op1, <1 x i64> %op2, <1 x i1> %mask) { ; CHECK-LABEL: select_v1i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-NEXT: and x8, x0, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1 ; CHECK-NEXT: mov z2.d, x8 @@ -249,8 +249,8 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) { define void @select_v4i64(ptr %a, ptr %b) { ; CHECK-LABEL: select_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, z1.d ; CHECK-NEXT: cmpeq p0.d, p0/z, z2.d, z3.d diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll index bb1bd8fe72b21..c4aeb4465c537 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-log-reduce.ll @@ -48,8 +48,8 @@ define i8 @andv_v16i8(<16 x i8> %a) { define i8 @andv_v32i8(ptr %a) { ; CHECK-LABEL: andv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -98,8 +98,8 @@ define i16 @andv_v8i16(<8 x i16> %a) { define i16 @andv_v16i16(ptr %a) { ; CHECK-LABEL: andv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -136,8 +136,8 @@ define i32 @andv_v4i32(<4 x i32> %a) { define i32 @andv_v8i32(ptr %a) { ; CHECK-LABEL: andv_v8i32: ; CHECK: // 
%bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -162,8 +162,8 @@ define i64 @andv_v2i64(<2 x i64> %a) { define i64 @andv_v4i64(ptr %a) { ; CHECK-LABEL: andv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: and z0.d, z1.d, z0.d ; CHECK-NEXT: andv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -216,8 +216,8 @@ define i8 @eorv_v16i8(<16 x i8> %a) { define i8 @eorv_v32i8(ptr %a) { ; CHECK-LABEL: eorv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -266,8 +266,8 @@ define i16 @eorv_v8i16(<8 x i16> %a) { define i16 @eorv_v16i16(ptr %a) { ; CHECK-LABEL: eorv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -304,8 +304,8 @@ define i32 @eorv_v4i32(<4 x i32> %a) { define i32 @eorv_v8i32(ptr %a) { ; CHECK-LABEL: eorv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -330,8 +330,8 @@ define i64 @eorv_v2i64(<2 x i64> %a) { define i64 @eorv_v4i64(ptr %a) { ; CHECK-LABEL: eorv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: eor z0.d, z1.d, z0.d ; CHECK-NEXT: eorv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 @@ -384,8 +384,8 @@ define i8 @orv_v16i8(<16 x i8> %a) { define i8 @orv_v32i8(ptr %a) { ; CHECK-LABEL: orv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv b0, p0, z0.b ; CHECK-NEXT: fmov w0, s0 @@ -434,8 +434,8 @@ define i16 @orv_v8i16(<8 x i16> %a) { define i16 @orv_v16i16(ptr %a) { ; CHECK-LABEL: orv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv h0, p0, z0.h ; CHECK-NEXT: fmov w0, s0 @@ -472,8 +472,8 @@ define i32 @orv_v4i32(<4 x i32> %a) { define i32 @orv_v8i32(ptr %a) { ; CHECK-LABEL: orv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv s0, p0, z0.s ; CHECK-NEXT: fmov w0, s0 @@ -498,8 +498,8 @@ define i64 @orv_v2i64(<2 x i64> %a) { define i64 @orv_v4i64(ptr %a) { ; CHECK-LABEL: orv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q0, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: orr z0.d, z1.d, z0.d ; CHECK-NEXT: orv d0, p0, z0.d ; CHECK-NEXT: fmov x0, d0 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll index e812706744745..f2b3f9b12ea71 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-masked-store.ll @@ -123,8 +123,8 @@ define void @masked_store_v32i8(ptr %dst, <32 x i1> %mask) { ; 
CHECK-NEXT: asr z0.b, z0.b, #7 ; CHECK-NEXT: asr z1.b, z1.b, #7 ; CHECK-NEXT: cmpne p1.b, p0/z, z0.b, #0 -; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: cmpne p0.b, p0/z, z1.b, #0 +; CHECK-NEXT: mov z0.b, #0 // =0x0 ; CHECK-NEXT: st1b { z0.b }, p1, [x0, x8] ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: add sp, sp, #32 @@ -204,8 +204,8 @@ define void @masked_store_v16f16(ptr %dst, <16 x i1> %mask) { ; CHECK-NEXT: asr z0.h, z0.h, #15 ; CHECK-NEXT: asr z1.h, z1.h, #15 ; CHECK-NEXT: cmpne p1.h, p0/z, z1.h, #0 -; CHECK-NEXT: mov z1.h, #0 // =0x0 ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 +; CHECK-NEXT: mov z1.h, #0 // =0x0 ; CHECK-NEXT: st1h { z1.h }, p1, [x0, x8, lsl #1] ; CHECK-NEXT: st1h { z1.h }, p0, [x0] ; CHECK-NEXT: ret @@ -242,7 +242,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: fmov w8, s1 ; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: mov z2.s, #0 // =0x0 +; CHECK-NEXT: mov z2.b, z0.b[3] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s3 ; CHECK-NEXT: mov z3.b, z0.b[2] @@ -258,9 +258,9 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: lsl z1.s, z1.s, #31 ; CHECK-NEXT: asr z1.s, z1.s, #31 ; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0 -; CHECK-NEXT: mov z1.b, z0.b[3] -; CHECK-NEXT: st1w { z2.s }, p1, [x0, x8, lsl #2] -; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: mov z1.s, #0 // =0x0 +; CHECK-NEXT: st1w { z1.s }, p1, [x0, x8, lsl #2] +; CHECK-NEXT: fmov w8, s2 ; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s3 @@ -272,7 +272,7 @@ define void @masked_store_v8f32(ptr %dst, <8 x i1> %mask) { ; CHECK-NEXT: lsl z0.s, z0.s, #31 ; CHECK-NEXT: asr z0.s, z0.s, #31 ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 -; CHECK-NEXT: st1w { z2.s }, p0, [x0] +; CHECK-NEXT: st1w { z1.s }, p0, [x0] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret call void @llvm.masked.store.v8f32(<8 x float> zeroinitializer, ptr %dst, i32 8, <8 x i1> %mask) @@ -310,8 +310,8 @@ define void @masked_store_v4f64(ptr %dst, <4 x i1> %mask) { ; CHECK-NEXT: asr z1.d, z1.d, #63 ; CHECK-NEXT: asr z0.d, z0.d, #63 ; CHECK-NEXT: cmpne p1.d, p0/z, z0.d, #0 -; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 +; CHECK-NEXT: mov z0.d, #0 // =0x0 ; CHECK-NEXT: st1d { z0.d }, p1, [x0, x8, lsl #3] ; CHECK-NEXT: st1d { z0.d }, p0, [x0] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll index f0b0b3269e98f..6fcb95f283338 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-optimize-ptrue.ll @@ -170,8 +170,8 @@ define void @abs_v4i32(ptr %a) { define void @abs_v8i32(ptr %a) { ; CHECK-LABEL: abs_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: abs z0.s, p0/m, z0.s ; CHECK-NEXT: abs z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -199,8 +199,8 @@ define void @abs_v2i64(ptr %a) { define void @abs_v4i64(ptr %a) { ; CHECK-LABEL: abs_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: abs z0.d, p0/m, z0.d ; CHECK-NEXT: abs z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -263,8 +263,8 @@ define void @fadd_v8f16(ptr %a, ptr %b) { define void @fadd_v16f16(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v16f16: ; CHECK: // %bb.0: 
-; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: movprfx z1, z2 @@ -313,8 +313,8 @@ define void @fadd_v4f32(ptr %a, ptr %b) { define void @fadd_v8f32(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: movprfx z1, z2 @@ -347,8 +347,8 @@ define void @fadd_v2f64(ptr %a, ptr %b) { define void @fadd_v4f64(ptr %a, ptr %b) { ; CHECK-LABEL: fadd_v4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q3, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q1, q2, [x0] ; CHECK-NEXT: fadd z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: movprfx z1, z2 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll index d1bff4fa21a11..00413302798ca 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll @@ -9,8 +9,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_revbv16i16(ptr %a) { ; CHECK-LABEL: test_revbv16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -25,8 +25,8 @@ define void @test_revbv16i16(ptr %a) { define void @test_revbv8i32(ptr %a) { ; CHECK-LABEL: test_revbv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -41,8 +41,8 @@ define void @test_revbv8i32(ptr %a) { define void @test_revbv4i64(ptr %a) { ; CHECK-LABEL: test_revbv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -57,8 +57,8 @@ define void @test_revbv4i64(ptr %a) { define void @test_revhv8i32(ptr %a) { ; CHECK-LABEL: test_revhv8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: revh z0.s, p0/m, z0.s ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -73,8 +73,8 @@ define void @test_revhv8i32(ptr %a) { define void @test_revhv8f32(ptr %a) { ; CHECK-LABEL: test_revhv8f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: revh z0.s, p0/m, z0.s ; CHECK-NEXT: revh z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -89,8 +89,8 @@ define void @test_revhv8f32(ptr %a) { define void @test_revhv4i64(ptr %a) { ; CHECK-LABEL: test_revhv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revh z0.d, p0/m, z0.d ; CHECK-NEXT: revh z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -105,8 +105,8 @@ define void @test_revhv4i64(ptr %a) { define void @test_revwv4i64(ptr %a) { ; CHECK-LABEL: test_revwv4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp 
q0, q1, [x0] @@ -121,8 +121,8 @@ define void @test_revwv4i64(ptr %a) { define void @test_revwv4f64(ptr %a) { ; CHECK-LABEL: test_revwv4f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -150,8 +150,8 @@ define <16 x i8> @test_revv16i8(ptr %a) { define void @test_revwv8i32v8i32(ptr %a, ptr %b) { ; CHECK-LABEL: test_revwv8i32v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revw z0.d, p0/m, z0.d ; CHECK-NEXT: revw z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -166,8 +166,8 @@ define void @test_revwv8i32v8i32(ptr %a, ptr %b) { define void @test_revhv32i16(ptr %a) { ; CHECK-LABEL: test_revhv32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: revh z0.d, p0/m, z0.d ; CHECK-NEXT: revh z1.d, p0/m, z1.d @@ -202,8 +202,8 @@ define void @test_rev_elts_fail(ptr %a) { define void @test_revdv4i64_sve2p1(ptr %a) #1 { ; CHECK-LABEL: test_revdv4i64_sve2p1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: revd z0.q, p0/m, z0.q ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] @@ -217,8 +217,8 @@ define void @test_revdv4i64_sve2p1(ptr %a) #1 { define void @test_revdv4f64_sve2p1(ptr %a) #1 { ; CHECK-LABEL: test_revdv4f64_sve2p1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: revd z0.q, p0/m, z0.q ; CHECK-NEXT: revd z1.q, p0/m, z1.q ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll index d7bfb6b2680e1..cb73030306b02 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll @@ -82,11 +82,11 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: .cfi_def_cfa_offset 64 ; CHECK-NEXT: ldp q1, q3, [x1] ; CHECK-NEXT: ldp q0, q4, [x0] -; CHECK-NEXT: ldp q2, q6, [x0, #32] +; CHECK-NEXT: ldp q2, q5, [x0, #32] ; CHECK-NEXT: mov z16.h, z3.h[7] ; CHECK-NEXT: mov z18.h, z3.h[6] ; CHECK-NEXT: mov z17.h, z4.h[7] -; CHECK-NEXT: ldp q5, q7, [x1, #32] +; CHECK-NEXT: ldp q6, q7, [x1, #32] ; CHECK-NEXT: mov z19.h, z4.h[6] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: mov z16.h, z3.h[5] @@ -98,13 +98,13 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z18.h, z3.h[4] ; CHECK-NEXT: strh w9, [sp, #28] ; CHECK-NEXT: fmov w9, s19 -; CHECK-NEXT: mov z19.h, z6.h[7] +; CHECK-NEXT: mov z19.h, z5.h[7] ; CHECK-NEXT: zip1 z3.h, z4.h, z3.h ; CHECK-NEXT: strh w8, [sp, #26] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: mov z16.h, z4.h[4] ; CHECK-NEXT: strh w9, [sp, #24] -; CHECK-NEXT: zip1 z4.h, z6.h, z7.h +; CHECK-NEXT: zip1 z4.h, z5.h, z7.h ; CHECK-NEXT: strh w8, [sp, #22] ; CHECK-NEXT: fmov w8, s17 ; CHECK-NEXT: mov z17.h, z1.h[7] @@ -131,7 +131,7 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z0.h[4] ; CHECK-NEXT: zip1 z0.h, z0.h, z1.h -; CHECK-NEXT: zip1 z1.h, z2.h, z5.h +; CHECK-NEXT: zip1 z1.h, z2.h, z6.h ; CHECK-NEXT: strh w8, [sp, #54] ; CHECK-NEXT: fmov w8, s16 ; CHECK-NEXT: ldr q16, [sp, #16] @@ -143,41 
+143,41 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: mov z18.h, z7.h[7] ; CHECK-NEXT: strh w8, [sp, #48] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z6.h[6] +; CHECK-NEXT: mov z18.h, z5.h[6] ; CHECK-NEXT: ldr q17, [sp, #48] ; CHECK-NEXT: strh w8, [sp, #46] ; CHECK-NEXT: fmov w8, s19 ; CHECK-NEXT: mov z19.h, z7.h[5] ; CHECK-NEXT: strh w8, [sp, #44] ; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z6.h[5] +; CHECK-NEXT: mov z20.h, z5.h[5] ; CHECK-NEXT: strh w8, [sp, #42] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z7.h[4] ; CHECK-NEXT: strh w8, [sp, #40] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z6.h[4] +; CHECK-NEXT: mov z19.h, z5.h[4] ; CHECK-NEXT: strh w8, [sp, #38] ; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[7] +; CHECK-NEXT: mov z20.h, z6.h[7] ; CHECK-NEXT: strh w8, [sp, #36] ; CHECK-NEXT: fmov w8, s18 ; CHECK-NEXT: mov z18.h, z2.h[7] ; CHECK-NEXT: strh w8, [sp, #34] ; CHECK-NEXT: fmov w8, s19 -; CHECK-NEXT: mov z19.h, z5.h[6] +; CHECK-NEXT: mov z19.h, z6.h[6] ; CHECK-NEXT: strh w8, [sp, #32] ; CHECK-NEXT: fmov w8, s20 ; CHECK-NEXT: mov z20.h, z2.h[6] ; CHECK-NEXT: strh w8, [sp, #14] ; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: mov z18.h, z5.h[5] +; CHECK-NEXT: mov z18.h, z6.h[5] ; CHECK-NEXT: strh w8, [sp, #12] ; CHECK-NEXT: fmov w8, s19 ; CHECK-NEXT: mov z19.h, z2.h[5] ; CHECK-NEXT: strh w8, [sp, #10] ; CHECK-NEXT: fmov w8, s20 -; CHECK-NEXT: mov z20.h, z5.h[4] +; CHECK-NEXT: mov z20.h, z6.h[4] ; CHECK-NEXT: fmov w9, s19 ; CHECK-NEXT: strh w8, [sp, #8] ; CHECK-NEXT: fmov w8, s18 @@ -186,10 +186,10 @@ define void @zip_v32i16(ptr %a, ptr %b) { ; CHECK-NEXT: ldr q2, [sp, #32] ; CHECK-NEXT: strh w8, [sp, #6] ; CHECK-NEXT: fmov w8, s20 +; CHECK-NEXT: fmov w9, s18 ; CHECK-NEXT: add z2.h, z16.h, z2.h ; CHECK-NEXT: strh w8, [sp, #2] -; CHECK-NEXT: fmov w8, s18 -; CHECK-NEXT: strh w8, [sp] +; CHECK-NEXT: strh w9, [sp] ; CHECK-NEXT: ldr q4, [sp] ; CHECK-NEXT: stp q3, q2, [x0, #32] ; CHECK-NEXT: add z1.h, z17.h, z4.h @@ -471,9 +471,9 @@ define void @trn_v4f64(ptr %a, ptr %b) { define void @trn_v4f32(ptr %a, ptr %b) { ; CHECK-LABEL: trn_v4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: trn1 z2.s, z0.s, z1.s ; CHECK-NEXT: trn2 z0.s, z0.s, z1.s ; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll index f2ba4a7cc3567..ab7c42b3e9e37 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ptest.ll @@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu" define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q1, q0, [x0, #32] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q2, q3, [x0] ; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 @@ -20,7 +20,6 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ptrue p1.b, vl16 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h @@ -31,7 +30,8 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; 
CHECK-NEXT: splice z1.b, p0, z1.b, z0.b -; CHECK-NEXT: umaxv b0, p1, z1.b +; CHECK-NEXT: ptrue p0.b, vl16 +; CHECK-NEXT: umaxv b0, p0, z1.b ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: and w0, w8, #0x1 ; CHECK-NEXT: ret @@ -45,41 +45,41 @@ define i1 @ptest_v16i1(ptr %a, ptr %b) { define i1 @ptest_or_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_or_v16i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q4, q5, [x1] -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: fcmne p1.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p2.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmne p3.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p4.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z7.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 -; CHECK-NEXT: fcmne p7.s, p0/z, z5.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q4, q5, [x1, #32] +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 +; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h -; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h -; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b @@ -112,41 +112,41 @@ declare i1 @llvm.vector.reduce.or.i1.v16i1(<16 x i1>) define i1 @ptest_and_v16i1(ptr %a, ptr %b) { ; CHECK-LABEL: ptest_and_v16i1: ; CHECK: // %bb.0: +; CHECK-NEXT: ldp q1, q0, [x0, #32] ; CHECK-NEXT: ptrue p0.s, vl4 -; CHECK-NEXT: ldp q0, q1, [x0] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: ldp q4, q5, [x1] -; CHECK-NEXT: ldp q6, q7, [x1, #32] -; CHECK-NEXT: fcmne p1.s, p0/z, z3.s, #0.0 -; CHECK-NEXT: fcmne p2.s, p0/z, z2.s, #0.0 -; CHECK-NEXT: fcmne p3.s, p0/z, z1.s, #0.0 -; CHECK-NEXT: fcmne p4.s, p0/z, z0.s, #0.0 -; CHECK-NEXT: fcmne p5.s, p0/z, z7.s, #0.0 -; CHECK-NEXT: fcmne p6.s, p0/z, z6.s, #0.0 -; CHECK-NEXT: fcmne p7.s, p0/z, z5.s, #0.0 -; CHECK-NEXT: fcmne p0.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: ldp q2, q3, [x0] +; CHECK-NEXT: ldp q4, q5, [x1, #32] +; CHECK-NEXT: fcmne p1.s, p0/z, z0.s, #0.0 +; 
CHECK-NEXT: fcmne p2.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: ldp q0, q1, [x1] +; CHECK-NEXT: fcmne p3.s, p0/z, z3.s, #0.0 +; CHECK-NEXT: fcmne p4.s, p0/z, z2.s, #0.0 +; CHECK-NEXT: fcmne p5.s, p0/z, z5.s, #0.0 +; CHECK-NEXT: fcmne p6.s, p0/z, z4.s, #0.0 +; CHECK-NEXT: fcmne p7.s, p0/z, z1.s, #0.0 +; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0 ; CHECK-NEXT: mov z0.s, p1/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z1.s, p2/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z2.s, p3/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z3.s, p4/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z4.s, p5/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z5.s, p6/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: mov z6.s, p7/z, #-1 // =0xffffffffffffffff ; CHECK-NEXT: mov z7.s, p0/z, #-1 // =0xffffffffffffffff -; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: splice z1.h, p0, z1.h, z0.h +; CHECK-NEXT: splice z3.h, p0, z3.h, z2.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z4.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z6.h ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: splice z1.h, p1, z1.h, z0.h -; CHECK-NEXT: splice z3.h, p1, z3.h, z2.h -; CHECK-NEXT: splice z5.h, p1, z5.h, z4.h -; CHECK-NEXT: splice z7.h, p1, z7.h, z6.h ; CHECK-NEXT: uzp1 z0.b, z1.b, z1.b ; CHECK-NEXT: uzp1 z1.b, z3.b, z3.b ; CHECK-NEXT: uzp1 z2.b, z5.b, z5.b diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll index f686efff67b66..bfa931044bc53 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-rev.ll @@ -49,8 +49,8 @@ define <16 x i8> @bitreverse_v16i8(<16 x i8> %op) { define void @bitreverse_v32i8(ptr %a) { ; CHECK-LABEL: bitreverse_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: rbit z0.b, p0/m, z0.b ; CHECK-NEXT: rbit z1.b, p0/m, z1.b ; CHECK-NEXT: stp q0, q1, [x0] @@ -101,8 +101,8 @@ define <8 x i16> @bitreverse_v8i16(<8 x i16> %op) { define void @bitreverse_v16i16(ptr %a) { ; CHECK-LABEL: bitreverse_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: rbit z0.h, p0/m, z0.h ; CHECK-NEXT: rbit z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -140,8 +140,8 @@ define <4 x i32> @bitreverse_v4i32(<4 x i32> %op) { define void @bitreverse_v8i32(ptr %a) { ; CHECK-LABEL: bitreverse_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: rbit z0.s, p0/m, z0.s ; CHECK-NEXT: rbit z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -179,8 +179,8 @@ define <2 x i64> @bitreverse_v2i64(<2 x i64> %op) { define void @bitreverse_v4i64(ptr %a) { ; CHECK-LABEL: bitreverse_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: rbit z0.d, p0/m, z0.d ; CHECK-NEXT: rbit z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] @@ -235,8 +235,8 @@ define <8 x i16> @bswap_v8i16(<8 x i16> %op) { define void @bswap_v16i16(ptr %a) { ; CHECK-LABEL: 
bswap_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: revb z0.h, p0/m, z0.h ; CHECK-NEXT: revb z1.h, p0/m, z1.h ; CHECK-NEXT: stp q0, q1, [x0] @@ -274,8 +274,8 @@ define <4 x i32> @bswap_v4i32(<4 x i32> %op) { define void @bswap_v8i32(ptr %a) { ; CHECK-LABEL: bswap_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: revb z0.s, p0/m, z0.s ; CHECK-NEXT: revb z1.s, p0/m, z1.s ; CHECK-NEXT: stp q0, q1, [x0] @@ -313,8 +313,8 @@ define <2 x i64> @bswap_v2i64(<2 x i64> %op) { define void @bswap_v4i64(ptr %a) { ; CHECK-LABEL: bswap_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: revb z0.d, p0/m, z0.d ; CHECK-NEXT: revb z1.d, p0/m, z1.d ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll index 76bb465774d5b..9dd42e7831e0d 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-sdiv-pow2.ll @@ -45,8 +45,8 @@ define <16 x i8> @sdiv_v16i8(<16 x i8> %op1) { define void @sdiv_v32i8(ptr %a) { ; CHECK-LABEL: sdiv_v32i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.b, vl16 ; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #5 ; CHECK-NEXT: asrd z1.b, p0/m, z1.b, #5 ; CHECK-NEXT: stp q0, q1, [x0] @@ -97,8 +97,8 @@ define <8 x i16> @sdiv_v8i16(<8 x i16> %op1) { define void @sdiv_v16i16(ptr %a) { ; CHECK-LABEL: sdiv_v16i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.h, vl8 ; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #5 ; CHECK-NEXT: asrd z1.h, p0/m, z1.h, #5 ; CHECK-NEXT: stp q0, q1, [x0] @@ -136,8 +136,8 @@ define <4 x i32> @sdiv_v4i32(<4 x i32> %op1) { define void @sdiv_v8i32(ptr %a) { ; CHECK-LABEL: sdiv_v8i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #5 ; CHECK-NEXT: asrd z1.s, p0/m, z1.s, #5 ; CHECK-NEXT: stp q0, q1, [x0] @@ -176,8 +176,8 @@ define <2 x i64> @sdiv_v2i64(<2 x i64> %op1) { define void @sdiv_v4i64(ptr %a) { ; CHECK-LABEL: sdiv_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #5 ; CHECK-NEXT: asrd z1.d, p0/m, z1.d, #5 ; CHECK-NEXT: stp q0, q1, [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll index ff1f8699b91af..ad0d4ef0afef3 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll @@ -37,9 +37,9 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2 define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2) { ; CHECK-LABEL: interleave_store_legalization: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov z5.d, z2.d ; CHECK-NEXT: // kill: def $q3 killed $q3 def $z2_z3 +; CHECK-NEXT: ptrue p0.s, vl4 ; CHECK-NEXT: mov x8, #8 // =0x8 ; CHECK-NEXT: mov z4.d, z0.d ; CHECK-NEXT: mov z2.d, z1.d diff --git 
a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll index 367ccbeeea81e..06709ca3685c8 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-stores.ll @@ -8,8 +8,8 @@ target triple = "aarch64-unknown-linux-gnu" define void @store_v4i8(ptr %a) { ; CHECK-LABEL: store_v4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: st1b { z0.h }, p0, [x0] ; CHECK-NEXT: ret store <4 x i8> zeroinitializer, ptr %a @@ -49,8 +49,8 @@ define void @store_v32i8(ptr %a) { define void @store_v2i16(ptr %a) { ; CHECK-LABEL: store_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret store <2 x i16> zeroinitializer, ptr %a diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll index 4fef678314019..70219dd30f769 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-trunc.ll @@ -87,51 +87,51 @@ define void @trunc_v64i16_v64i8(ptr %in, ptr %out) nounwind { define void @trunc_v128i16_v128i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v128i16_v128i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.b, vl8 -; CHECK-NEXT: ldp q6, q7, [x0, #224] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b -; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q6, q7, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #224] ; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b -; CHECK-NEXT: ldp q16, q17, [x0, #64] ; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z1.b, z17.b, z17.b -; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b -; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b +; CHECK-NEXT: uzp1 z7.b, z7.b, z7.b +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z17.b, z17.b, z17.b +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: uzp1 z16.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z21.b, z21.b, z21.b +; CHECK-NEXT: ldp q18, q19, [x0, #128] +; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b +; CHECK-NEXT: uzp1 z3.b, z21.b, z21.b ; CHECK-NEXT: uzp1 z20.b, z20.b, z20.b -; CHECK-NEXT: uzp1 z5.b, z5.b, z5.b -; CHECK-NEXT: uzp1 z7.b, z23.b, z23.b -; CHECK-NEXT: uzp1 z17.b, z22.b, z22.b +; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b +; CHECK-NEXT: ldp q21, q22, [x0] +; CHECK-NEXT: splice z16.b, p0, z16.b, z17.b +; CHECK-NEXT: uzp1 z19.b, z19.b, z19.b +; CHECK-NEXT: uzp1 z18.b, z18.b, z18.b ; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b -; CHECK-NEXT: splice z2.b, p0, z2.b, z3.b -; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: splice z20.b, p0, z20.b, z3.b +; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b +; CHECK-NEXT: splice z6.b, p0, z6.b, z7.b +; CHECK-NEXT: uzp1 z5.b, z22.b, z22.b +; CHECK-NEXT: uzp1 z7.b, z21.b, z21.b +; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b ; CHECK-NEXT: splice z18.b, p0, z18.b, z19.b -; 
CHECK-NEXT: splice z16.b, p0, z16.b, z1.b -; CHECK-NEXT: add z1.b, z6.b, z6.b -; CHECK-NEXT: splice z20.b, p0, z20.b, z21.b -; CHECK-NEXT: splice z17.b, p0, z17.b, z7.b -; CHECK-NEXT: splice z4.b, p0, z4.b, z5.b -; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: add z2.b, z2.b, z2.b +; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b +; CHECK-NEXT: add z3.b, z16.b, z16.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b +; CHECK-NEXT: add z1.b, z20.b, z20.b ; CHECK-NEXT: add z5.b, z18.b, z18.b -; CHECK-NEXT: add z0.b, z16.b, z16.b -; CHECK-NEXT: add z3.b, z20.b, z20.b -; CHECK-NEXT: add z1.b, z17.b, z17.b -; CHECK-NEXT: add z4.b, z4.b, z4.b -; CHECK-NEXT: stp q5, q3, [x1, #64] -; CHECK-NEXT: stp q4, q2, [x1] -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: add z2.b, z6.b, z6.b +; CHECK-NEXT: add z3.b, z4.b, z4.b +; CHECK-NEXT: add z4.b, z7.b, z7.b +; CHECK-NEXT: add z0.b, z0.b, z0.b +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret %a = load <128 x i16>, ptr %in %b = trunc <128 x i16> %a to <128 x i8> @@ -226,55 +226,55 @@ define void @trunc_v32i32_v32i8(ptr %in, ptr %out) nounwind { define void @trunc_v64i32_v64i8(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v64i32_v64i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] -; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: ldp q2, q3, [x0, #160] -; CHECK-NEXT: ptrue p1.b, vl8 -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: ldp q16, q17, [x0, #128] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: ldp q4, q5, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: ldp q16, q17, [x0] +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h ; CHECK-NEXT: ldp q22, q23, [x0, #32] -; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h -; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h ; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z21.h, z21.h, z21.h -; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h +; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h +; CHECK-NEXT: uzp1 z5.h, z20.h, z20.h +; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z20.h, z23.h, z23.h +; CHECK-NEXT: uzp1 z21.h, z22.h, z22.h ; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z23.h, z23.h, z23.h -; CHECK-NEXT: uzp1 z22.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h ; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h -; CHECK-NEXT: splice z20.h, p0, z20.h, z21.h +; CHECK-NEXT: splice z5.h, p0, z5.h, z3.h +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h +; CHECK-NEXT: splice z21.h, p0, z21.h, z20.h ; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: splice z22.h, p0, z22.h, z23.h -; CHECK-NEXT: splice z4.h, p0, z4.h, z5.h ; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: uzp1 z1.b, z2.b, z2.b -; 
CHECK-NEXT: uzp1 z2.b, z16.b, z16.b -; CHECK-NEXT: uzp1 z5.b, z18.b, z18.b -; CHECK-NEXT: uzp1 z3.b, z20.b, z20.b -; CHECK-NEXT: uzp1 z6.b, z6.b, z6.b -; CHECK-NEXT: uzp1 z7.b, z22.b, z22.b -; CHECK-NEXT: uzp1 z4.b, z4.b, z4.b +; CHECK-NEXT: uzp1 z2.b, z4.b, z4.b +; CHECK-NEXT: ptrue p0.b, vl8 +; CHECK-NEXT: uzp1 z4.b, z18.b, z18.b +; CHECK-NEXT: uzp1 z3.b, z5.b, z5.b +; CHECK-NEXT: uzp1 z7.b, z16.b, z16.b +; CHECK-NEXT: uzp1 z5.b, z21.b, z21.b +; CHECK-NEXT: splice z2.b, p0, z2.b, z1.b +; CHECK-NEXT: uzp1 z1.b, z6.b, z6.b ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b -; CHECK-NEXT: splice z2.b, p1, z2.b, z1.b -; CHECK-NEXT: splice z5.b, p1, z5.b, z3.b -; CHECK-NEXT: splice z6.b, p1, z6.b, z7.b -; CHECK-NEXT: splice z0.b, p1, z0.b, z4.b +; CHECK-NEXT: splice z4.b, p0, z4.b, z3.b +; CHECK-NEXT: splice z7.b, p0, z7.b, z5.b +; CHECK-NEXT: splice z0.b, p0, z0.b, z1.b ; CHECK-NEXT: add z1.b, z2.b, z2.b -; CHECK-NEXT: add z2.b, z5.b, z5.b -; CHECK-NEXT: add z3.b, z6.b, z6.b +; CHECK-NEXT: add z2.b, z4.b, z4.b +; CHECK-NEXT: add z3.b, z7.b, z7.b ; CHECK-NEXT: add z0.b, z0.b, z0.b ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] @@ -368,51 +368,51 @@ define void @trunc_v32i32_v32i16(ptr %in, ptr %out) nounwind { define void @trunc_v64i32_v64i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v64i32_v64i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.h, vl4 -; CHECK-NEXT: ldp q6, q7, [x0, #224] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h -; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q6, q7, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #224] ; CHECK-NEXT: uzp1 z3.h, z3.h, z3.h -; CHECK-NEXT: ldp q16, q17, [x0, #64] ; CHECK-NEXT: uzp1 z2.h, z2.h, z2.h -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z1.h, z17.h, z17.h -; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h -; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z7.h, z7.h, z7.h +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z17.h, z17.h, z17.h +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: uzp1 z16.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z21.h, z21.h, z21.h +; CHECK-NEXT: ldp q18, q19, [x0, #128] +; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h +; CHECK-NEXT: uzp1 z3.h, z21.h, z21.h ; CHECK-NEXT: uzp1 z20.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z5.h, z5.h, z5.h -; CHECK-NEXT: uzp1 z7.h, z23.h, z23.h -; CHECK-NEXT: uzp1 z17.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: ldp q21, q22, [x0] +; CHECK-NEXT: splice z16.h, p0, z16.h, z17.h +; CHECK-NEXT: uzp1 z19.h, z19.h, z19.h +; CHECK-NEXT: uzp1 z18.h, z18.h, z18.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h -; CHECK-NEXT: splice z2.h, p0, z2.h, z3.h -; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: splice z20.h, p0, z20.h, z3.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: splice z6.h, p0, z6.h, z7.h +; CHECK-NEXT: uzp1 z5.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z7.h, z21.h, z21.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z18.h, p0, z18.h, z19.h -; CHECK-NEXT: splice z16.h, p0, z16.h, z1.h -; CHECK-NEXT: add z1.h, z6.h, z6.h -; CHECK-NEXT: splice z20.h, p0, z20.h, z21.h -; CHECK-NEXT: splice z17.h, p0, z17.h, z7.h -; CHECK-NEXT: 
splice z4.h, p0, z4.h, z5.h -; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: add z2.h, z2.h, z2.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h +; CHECK-NEXT: add z3.h, z16.h, z16.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h +; CHECK-NEXT: add z1.h, z20.h, z20.h ; CHECK-NEXT: add z5.h, z18.h, z18.h -; CHECK-NEXT: add z0.h, z16.h, z16.h -; CHECK-NEXT: add z3.h, z20.h, z20.h -; CHECK-NEXT: add z1.h, z17.h, z17.h -; CHECK-NEXT: add z4.h, z4.h, z4.h -; CHECK-NEXT: stp q5, q3, [x1, #64] -; CHECK-NEXT: stp q4, q2, [x1] -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: add z2.h, z6.h, z6.h +; CHECK-NEXT: add z3.h, z4.h, z4.h +; CHECK-NEXT: add z4.h, z7.h, z7.h +; CHECK-NEXT: add z0.h, z0.h, z0.h +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret %a = load <64 x i32>, ptr %in %b = trunc <64 x i32> %a to <64 x i16> @@ -535,19 +535,19 @@ define void @trunc_v32i64_v32i8(ptr %in, ptr %out) nounwind { ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s -; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s ; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s ; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s +; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s ; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ptrue p0.h, vl4 ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h ; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h ; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h +; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h ; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h +; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h ; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h @@ -658,55 +658,55 @@ define void @trunc_v16i64_v16i16(ptr %in, ptr %out) nounwind { define void @trunc_v32i64_v32i16(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i64_v32i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #64] -; CHECK-NEXT: ptrue p0.s, vl2 ; CHECK-NEXT: ldp q2, q3, [x0, #160] -; CHECK-NEXT: ptrue p1.h, vl4 -; CHECK-NEXT: ldp q4, q5, [x0, #96] -; CHECK-NEXT: ldp q6, q7, [x0] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: ldp q16, q17, [x0, #128] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: ldp q4, q5, [x0, #128] +; CHECK-NEXT: ldp q0, q1, [x0, #64] +; CHECK-NEXT: ldp q6, q7, [x0, #96] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s +; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s +; CHECK-NEXT: ldp q16, q17, [x0] +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: ldp q18, q19, [x0, #192] ; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s +; CHECK-NEXT: ldp q20, q21, [x0, #224] +; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s ; CHECK-NEXT: ldp q22, q23, [x0, #32] -; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s -; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s ; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s -; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s -; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s +; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s +; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s +; CHECK-NEXT: uzp1 z5.s, z20.s, z20.s +; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s +; CHECK-NEXT: 
uzp1 z20.s, z23.s, z23.s +; CHECK-NEXT: uzp1 z21.s, z22.s, z22.s ; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: uzp1 z23.s, z23.s, z23.s -; CHECK-NEXT: uzp1 z22.s, z22.s, z22.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s ; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s +; CHECK-NEXT: splice z5.s, p0, z5.s, z3.s +; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s +; CHECK-NEXT: splice z21.s, p0, z21.s, z20.s ; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: splice z22.s, p0, z22.s, z23.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s ; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: uzp1 z1.h, z2.h, z2.h -; CHECK-NEXT: uzp1 z2.h, z16.h, z16.h -; CHECK-NEXT: uzp1 z5.h, z18.h, z18.h -; CHECK-NEXT: uzp1 z3.h, z20.h, z20.h -; CHECK-NEXT: uzp1 z6.h, z6.h, z6.h -; CHECK-NEXT: uzp1 z7.h, z22.h, z22.h -; CHECK-NEXT: uzp1 z4.h, z4.h, z4.h +; CHECK-NEXT: uzp1 z2.h, z4.h, z4.h +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z4.h, z18.h, z18.h +; CHECK-NEXT: uzp1 z3.h, z5.h, z5.h +; CHECK-NEXT: uzp1 z7.h, z16.h, z16.h +; CHECK-NEXT: uzp1 z5.h, z21.h, z21.h +; CHECK-NEXT: splice z2.h, p0, z2.h, z1.h +; CHECK-NEXT: uzp1 z1.h, z6.h, z6.h ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h -; CHECK-NEXT: splice z2.h, p1, z2.h, z1.h -; CHECK-NEXT: splice z5.h, p1, z5.h, z3.h -; CHECK-NEXT: splice z6.h, p1, z6.h, z7.h -; CHECK-NEXT: splice z0.h, p1, z0.h, z4.h +; CHECK-NEXT: splice z4.h, p0, z4.h, z3.h +; CHECK-NEXT: splice z7.h, p0, z7.h, z5.h +; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h ; CHECK-NEXT: add z1.h, z2.h, z2.h -; CHECK-NEXT: add z2.h, z5.h, z5.h -; CHECK-NEXT: add z3.h, z6.h, z6.h +; CHECK-NEXT: add z2.h, z4.h, z4.h +; CHECK-NEXT: add z3.h, z7.h, z7.h ; CHECK-NEXT: add z0.h, z0.h, z0.h ; CHECK-NEXT: stp q1, q2, [x1, #32] ; CHECK-NEXT: stp q3, q0, [x1] @@ -800,51 +800,51 @@ define void @trunc_v16i64_v16i32(ptr %in, ptr %out) nounwind { define void @trunc_v32i64_v32i32(ptr %in, ptr %out) nounwind { ; CHECK-LABEL: trunc_v32i64_v32i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0, #192] +; CHECK-NEXT: ldp q2, q3, [x0, #192] ; CHECK-NEXT: ptrue p0.s, vl2 -; CHECK-NEXT: ldp q6, q7, [x0, #224] -; CHECK-NEXT: ldp q2, q3, [x0, #32] -; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s -; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s -; CHECK-NEXT: uzp1 z6.s, z6.s, z6.s -; CHECK-NEXT: ldp q4, q5, [x0] +; CHECK-NEXT: ldp q6, q7, [x0, #64] +; CHECK-NEXT: ldp q16, q17, [x0, #224] ; CHECK-NEXT: uzp1 z3.s, z3.s, z3.s -; CHECK-NEXT: ldp q16, q17, [x0, #64] ; CHECK-NEXT: uzp1 z2.s, z2.s, z2.s -; CHECK-NEXT: ldp q18, q19, [x0, #128] -; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s ; CHECK-NEXT: ldp q20, q21, [x0, #160] -; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s -; CHECK-NEXT: ldp q22, q23, [x0, #96] -; CHECK-NEXT: uzp1 z1.s, z17.s, z17.s -; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s -; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s +; CHECK-NEXT: uzp1 z7.s, z7.s, z7.s +; CHECK-NEXT: ldp q0, q1, [x0, #32] +; CHECK-NEXT: uzp1 z17.s, z17.s, z17.s +; CHECK-NEXT: ldp q4, q5, [x0, #96] ; CHECK-NEXT: uzp1 z16.s, z16.s, z16.s -; CHECK-NEXT: uzp1 z21.s, z21.s, z21.s +; CHECK-NEXT: ldp q18, q19, [x0, #128] +; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s +; CHECK-NEXT: uzp1 z3.s, z21.s, z21.s ; CHECK-NEXT: uzp1 z20.s, z20.s, z20.s -; CHECK-NEXT: uzp1 z5.s, z5.s, z5.s -; CHECK-NEXT: uzp1 z7.s, z23.s, z23.s -; CHECK-NEXT: uzp1 z17.s, z22.s, z22.s +; 
CHECK-NEXT: uzp1 z6.s, z6.s, z6.s +; CHECK-NEXT: ldp q21, q22, [x0] +; CHECK-NEXT: splice z16.s, p0, z16.s, z17.s +; CHECK-NEXT: uzp1 z19.s, z19.s, z19.s +; CHECK-NEXT: uzp1 z18.s, z18.s, z18.s ; CHECK-NEXT: uzp1 z4.s, z4.s, z4.s -; CHECK-NEXT: splice z2.s, p0, z2.s, z3.s -; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: splice z20.s, p0, z20.s, z3.s +; CHECK-NEXT: uzp1 z3.s, z5.s, z5.s +; CHECK-NEXT: splice z6.s, p0, z6.s, z7.s +; CHECK-NEXT: uzp1 z5.s, z22.s, z22.s +; CHECK-NEXT: uzp1 z7.s, z21.s, z21.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z1.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s ; CHECK-NEXT: splice z18.s, p0, z18.s, z19.s -; CHECK-NEXT: splice z16.s, p0, z16.s, z1.s -; CHECK-NEXT: add z1.s, z6.s, z6.s -; CHECK-NEXT: splice z20.s, p0, z20.s, z21.s -; CHECK-NEXT: splice z17.s, p0, z17.s, z7.s -; CHECK-NEXT: splice z4.s, p0, z4.s, z5.s -; CHECK-NEXT: stp q0, q1, [x1, #96] ; CHECK-NEXT: add z2.s, z2.s, z2.s +; CHECK-NEXT: splice z4.s, p0, z4.s, z3.s +; CHECK-NEXT: add z3.s, z16.s, z16.s +; CHECK-NEXT: splice z7.s, p0, z7.s, z5.s +; CHECK-NEXT: splice z0.s, p0, z0.s, z1.s +; CHECK-NEXT: add z1.s, z20.s, z20.s ; CHECK-NEXT: add z5.s, z18.s, z18.s -; CHECK-NEXT: add z0.s, z16.s, z16.s -; CHECK-NEXT: add z3.s, z20.s, z20.s -; CHECK-NEXT: add z1.s, z17.s, z17.s -; CHECK-NEXT: add z4.s, z4.s, z4.s -; CHECK-NEXT: stp q5, q3, [x1, #64] -; CHECK-NEXT: stp q4, q2, [x1] -; CHECK-NEXT: stp q0, q1, [x1, #32] +; CHECK-NEXT: stp q2, q3, [x1, #96] +; CHECK-NEXT: add z2.s, z6.s, z6.s +; CHECK-NEXT: add z3.s, z4.s, z4.s +; CHECK-NEXT: add z4.s, z7.s, z7.s +; CHECK-NEXT: add z0.s, z0.s, z0.s +; CHECK-NEXT: stp q5, q1, [x1, #64] +; CHECK-NEXT: stp q2, q3, [x1, #32] +; CHECK-NEXT: stp q4, q0, [x1] ; CHECK-NEXT: ret %a = load <32 x i64>, ptr %in %b = trunc <32 x i64> %a to <32 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-trunc.ll b/llvm/test/CodeGen/AArch64/sve-trunc.ll index 92dfc73961362..0ec6538947c73 100644 --- a/llvm/test/CodeGen/AArch64/sve-trunc.ll +++ b/llvm/test/CodeGen/AArch64/sve-trunc.ll @@ -61,8 +61,8 @@ entry: define @trunc_i64toi1( %in) { ; CHECK-LABEL: trunc_i64toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ret entry: @@ -73,9 +73,9 @@ entry: define @trunc_i64toi1_split( %in) { ; CHECK-LABEL: trunc_i64toi1_split: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s @@ -88,11 +88,11 @@ entry: define @trunc_i64toi1_split2( %in) { ; CHECK-LABEL: trunc_i64toi1_split2: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z3.d, z3.d, #0x1 ; CHECK-NEXT: and z2.d, z2.d, #0x1 ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0 ; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 ; CHECK-NEXT: cmpne p3.d, p0/z, z1.d, #0 @@ -111,12 +111,12 @@ define @trunc_i64toi1_split3( %in) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG ; CHECK-NEXT: .cfi_offset w29, -16 -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z7.d, z7.d, #0x1 ; CHECK-NEXT: and z6.d, z6.d, #0x1 ; CHECK-NEXT: and z5.d, z5.d, #0x1 @@ -125,23 +125,25 @@ define @trunc_i64toi1_split3( %in) { ; CHECK-NEXT: and z2.d, z2.d, #0x1 ; CHECK-NEXT: and z1.d, z1.d, #0x1 ; CHECK-NEXT: and z0.d, z0.d, #0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: cmpne p1.d, p0/z, z7.d, #0 ; CHECK-NEXT: cmpne p2.d, p0/z, z6.d, #0 ; CHECK-NEXT: cmpne p3.d, p0/z, z5.d, #0 ; CHECK-NEXT: cmpne p4.d, p0/z, z4.d, #0 ; CHECK-NEXT: cmpne p5.d, p0/z, z3.d, #0 ; CHECK-NEXT: cmpne p6.d, p0/z, z2.d, #0 -; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s -; CHECK-NEXT: cmpne p2.d, p0/z, z1.d, #0 +; CHECK-NEXT: cmpne p7.d, p0/z, z1.d, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 -; CHECK-NEXT: uzp1 p3.s, p4.s, p3.s -; CHECK-NEXT: uzp1 p4.s, p6.s, p5.s +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: uzp1 p2.s, p4.s, p3.s +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p3.s, p6.s, p5.s ; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p0.s, p0.s, p2.s +; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h ; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK-NEXT: uzp1 p1.h, p3.h, p1.h -; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h -; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h ; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b ; CHECK-NEXT: addvl sp, sp, #1 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload @@ -155,8 +157,8 @@ entry: define @trunc_i32toi1( %in) { ; CHECK-LABEL: trunc_i32toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: ret entry: @@ -167,8 +169,8 @@ entry: define @trunc_i16toi1( %in) { ; CHECK-LABEL: trunc_i16toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z0.h, z0.h, #0x1 +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0 ; CHECK-NEXT: ret entry: @@ -179,8 +181,8 @@ entry: define @trunc_i8toi1( %in) { ; CHECK-LABEL: trunc_i8toi1: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: and z0.b, z0.b, #0x1 +; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: cmpne p0.b, p0/z, z0.b, #0 ; CHECK-NEXT: ret entry: @@ -191,8 +193,8 @@ entry: define @trunc_nxv1i32_to_nxv1i1( %in) { ; CHECK-LABEL: trunc_nxv1i32_to_nxv1i1: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z0.s, z0.s, #0x1 +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0 ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: punpklo p0.h, p0.b @@ -204,8 +206,8 @@ define @trunc_nxv1i32_to_nxv1i1( %in) { define void @trunc_promoteIntRes( %0, ptr %ptr) { ; CHECK-LABEL: trunc_promoteIntRes: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1h { z0.s }, p0, [x0] ; CHECK-NEXT: ret entry: diff --git 
a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll index 36d64725742e5..818f37c85ffdb 100644 --- a/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll +++ b/llvm/test/CodeGen/AArch64/sve-umulo-sdnode.ll @@ -6,9 +6,9 @@ declare { , } @llvm.umul.with.overflow.nxv2i8 define @umulo_nxv2i8( %x, %y) { ; CHECK-LABEL: umulo_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xff ; CHECK-NEXT: and z0.d, z0.d, #0xff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #8 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -26,9 +26,9 @@ declare { , } @llvm.umul.with.overflow.nxv4i8 define @umulo_nxv4i8( %x, %y) { ; CHECK-LABEL: umulo_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xff ; CHECK-NEXT: and z0.s, z0.s, #0xff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z1.s, z0.s, #8 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 @@ -46,9 +46,9 @@ declare { , } @llvm.umul.with.overflow.nxv8i8 define @umulo_nxv8i8( %x, %y) { ; CHECK-LABEL: umulo_nxv8i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: and z1.h, z1.h, #0xff ; CHECK-NEXT: and z0.h, z0.h, #0xff +; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: mul z0.h, p0/m, z0.h, z1.h ; CHECK-NEXT: lsr z1.h, z0.h, #8 ; CHECK-NEXT: cmpne p0.h, p0/z, z1.h, #0 @@ -119,9 +119,9 @@ define @umulo_nxv64i8( %x, , } @llvm.umul.with.overflow.nxv2i define @umulo_nxv2i16( %x, %y) { ; CHECK-LABEL: umulo_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffff ; CHECK-NEXT: and z0.d, z0.d, #0xffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #16 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -163,9 +163,9 @@ declare { , } @llvm.umul.with.overflow.nxv4i define @umulo_nxv4i16( %x, %y) { ; CHECK-LABEL: umulo_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: and z1.s, z1.s, #0xffff ; CHECK-NEXT: and z0.s, z0.s, #0xffff +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: lsr z1.s, z0.s, #16 ; CHECK-NEXT: cmpne p0.s, p0/z, z1.s, #0 @@ -236,9 +236,9 @@ define @umulo_nxv32i16( %x, , } @llvm.umul.with.overflow.nxv2i define @umulo_nxv2i32( %x, %y) { ; CHECK-LABEL: umulo_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: and z1.d, z1.d, #0xffffffff ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: lsr z1.d, z0.d, #32 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 @@ -333,9 +333,9 @@ define @umulo_nxv16i32( %x, @umulo_nxv8i64( %x, %a, ptr %b) #0 { define void @uzp1_i8_invalid( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_i8_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: ptrue p0.b, vl128 ; CHECK-NEXT: st1b { z0.b }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to @@ -141,8 +141,8 @@ define void @uzp1_i16_valid( %a, ptr %b) #0 { define void @uzp1_i16_invalid( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_i16_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: ptrue p0.h, vl64 ; CHECK-NEXT: st1h { z0.h }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to @@ -168,8 +168,8 @@ define void @uzp1_i32_valid( %a, ptr %b) #0 { define void @uzp1_i32_invalid( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_i32_invalid: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; 
CHECK-NEXT: ptrue p0.s, vl32 ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to @@ -182,8 +182,8 @@ define void @uzp1_i32_invalid( %a, ptr %b) #0 { define void @uzp1_invalid_all( %a, ptr %b) #0 { ; CHECK-LABEL: uzp1_invalid_all: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0] ; CHECK-NEXT: ret %a.bc = bitcast %a to diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll index 194d1071301d4..91f8f5c2c90d8 100644 --- a/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll +++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-dot.ll @@ -8,11 +8,11 @@ define i32 @test( %bin.rdx, %bin.rdx2) { ; CHECK-NEXT: sunpklo z5.h, z0.b ; CHECK-NEXT: sunpkhi z0.h, z0.b ; CHECK-NEXT: sunpkhi z2.h, z2.b -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpklo z6.h, z1.b ; CHECK-NEXT: sunpkhi z1.h, z1.b ; CHECK-NEXT: sunpklo z7.h, z3.b ; CHECK-NEXT: sunpkhi z3.h, z3.b +; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: sunpkhi z24.s, z5.h ; CHECK-NEXT: sunpklo z5.s, z5.h ; CHECK-NEXT: sunpklo z25.s, z4.h diff --git a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll index 898090340869e..0bdaefdfc2a3f 100644 --- a/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll +++ b/llvm/test/CodeGen/AArch64/sve-vecreduce-fold.ll @@ -80,8 +80,8 @@ define i1 @reduce_and_insert_subvec_into_var( %in, @test_copysign_v4f32_v4f64( %a, ; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64: ; CHECK_NO_EXTEND_ROUND: // %bb.0: ; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d +; CHECK_NO_EXTEND_ROUND-NEXT: mov z3.s, #0x7fffffff ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z2.s, p0/m, z2.d ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z1.s, p0/m, z1.d ; CHECK_NO_EXTEND_ROUND-NEXT: uzp1 z1.s, z1.s, z2.s -; CHECK_NO_EXTEND_ROUND-NEXT: mov z2.s, #0x7fffffff -; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z2.d +; CHECK_NO_EXTEND_ROUND-NEXT: bsl z0.d, z0.d, z1.d, z3.d ; CHECK_NO_EXTEND_ROUND-NEXT: ret ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f32_v4f64: @@ -107,9 +107,9 @@ declare @llvm.copysign.v2f64( %a, @test_copysign_v4f64_v4f32( %a, %b) #0 { ; CHECK_NO_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_NO_EXTEND_ROUND: // %bb.0: -; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_NO_EXTEND_ROUND-NEXT: uunpkhi z3.d, z2.s ; CHECK_NO_EXTEND_ROUND-NEXT: uunpklo z2.d, z2.s +; CHECK_NO_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_NO_EXTEND_ROUND-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z3.d, p0/m, z3.s ; CHECK_NO_EXTEND_ROUND-NEXT: fcvt z2.d, p0/m, z2.s @@ -119,9 +119,9 @@ define @test_copysign_v4f64_v4f32( %a ; ; CHECK_EXTEND_ROUND-LABEL: test_copysign_v4f64_v4f32: ; CHECK_EXTEND_ROUND: // %bb.0: -; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_EXTEND_ROUND-NEXT: uunpkhi z3.d, z2.s ; CHECK_EXTEND_ROUND-NEXT: uunpklo z2.d, z2.s +; CHECK_EXTEND_ROUND-NEXT: ptrue p0.d ; CHECK_EXTEND_ROUND-NEXT: mov z4.d, #0x7fffffffffffffff ; CHECK_EXTEND_ROUND-NEXT: fcvt z2.d, p0/m, z2.s ; CHECK_EXTEND_ROUND-NEXT: fcvt z3.d, p0/m, z3.s @@ -176,11 +176,11 @@ define @test_copysign_v4f16_v4f64( %a, @test_copysign_v8f16_v8f32( %a, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_b: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.b ; CHECK-NEXT: rshrnb z1.b, z1.h, #6 ; CHECK-NEXT: rshrnb z0.b, z0.h, #6 +; CHECK-NEXT: ptrue p0.b +; CHECK-NEXT: ld1b { z2.b }, p0/z, [x0, x1] ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b -; CHECK-NEXT: ld1b { z1.b }, p0/z, [x0, x1] -; CHECK-NEXT: add z0.b, 
z1.b, z0.b +; CHECK-NEXT: add z0.b, z2.b, z0.b ; CHECK-NEXT: st1b { z0.b }, p0, [x0, x1] ; CHECK-NEXT: ret %1 = add %arg1, splat (i16 32) @@ -141,12 +141,12 @@ define void @wide_add_shift_add_rshrnb_b(ptr %dest, i64 %index, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_h: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.h ; CHECK-NEXT: rshrnb z1.h, z1.s, #6 ; CHECK-NEXT: rshrnb z0.h, z0.s, #6 +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1h { z2.h }, p0/z, [x0, x1, lsl #1] ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, x1, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h +; CHECK-NEXT: add z0.h, z2.h, z0.h ; CHECK-NEXT: st1h { z0.h }, p0, [x0, x1, lsl #1] ; CHECK-NEXT: ret %1 = add %arg1, splat (i32 32) @@ -162,12 +162,12 @@ define void @wide_add_shift_add_rshrnb_h(ptr %dest, i64 %index, %arg1){ ; CHECK-LABEL: wide_add_shift_add_rshrnb_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.s ; CHECK-NEXT: rshrnb z1.s, z1.d, #32 ; CHECK-NEXT: rshrnb z0.s, z0.d, #32 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1w { z2.s }, p0/z, [x0, x1, lsl #2] ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, x1, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s +; CHECK-NEXT: add z0.s, z2.s, z0.s ; CHECK-NEXT: st1w { z0.s }, p0, [x0, x1, lsl #2] ; CHECK-NEXT: ret %1 = add %arg1, splat (i64 2147483648) @@ -188,11 +188,11 @@ define void @neg_wide_add_shift_add_rshrnb_d(ptr %dest, i64 %index, %arg1, splat (i64 140737488355328) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll index 8bfcd088e1a86..500973d053f5b 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-int-arith-imm.ll @@ -437,8 +437,8 @@ define @uqsub_i32_ptrue_all_h( %a) #0 { define @uqsub_i32_ptrue_all_d( %a) #0 { ; CHECK-LABEL: uqsub_i32_ptrue_all_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov z1.s, #1 // =0x1 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: uqsub z0.s, p0/m, z0.s, z1.s ; CHECK-NEXT: ret %pg.d = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) diff --git a/llvm/test/CodeGen/AArch64/sve2-rsh.ll b/llvm/test/CodeGen/AArch64/sve2-rsh.ll index 516ef3bd581ee..9addd16f89292 100644 --- a/llvm/test/CodeGen/AArch64/sve2-rsh.ll +++ b/llvm/test/CodeGen/AArch64/sve2-rsh.ll @@ -18,8 +18,8 @@ define @neg_urshr_1( %x) { define @neg_urshr_2( %x, %y) { ; CHECK-LABEL: neg_urshr_2: ; CHECK: // %bb.0: -; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: add z0.d, z0.d, #32 // =0x20 +; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d ; CHECK-NEXT: ret %add = add nuw nsw %x, splat (i64 32) diff --git a/llvm/test/CodeGen/AArch64/sve2-xar.ll b/llvm/test/CodeGen/AArch64/sve2-xar.ll index e297ade6b9ae1..e5a240b7a53fd 100644 --- a/llvm/test/CodeGen/AArch64/sve2-xar.ll +++ b/llvm/test/CodeGen/AArch64/sve2-xar.ll @@ -152,9 +152,9 @@ define @xar_nxv2i64_l_neg1( %x, , } @sel_x2_i8(target("aarch64.svc ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.b, z1.b }, pn8, { z6.b, z7.b }, { z4.b, z5.b } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -28,12 +28,12 @@ define { , } @sel_x2_i16(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -48,12 +48,12 @@ define { , } @sel_x2_f16(target("aarch64. ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -68,12 +68,12 @@ define { , } @sel_x2_bf16(target("aar ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.h, z1.h }, pn8, { z6.h, z7.h }, { z4.h, z5.h } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -88,12 +88,12 @@ define { , } @sel_x2_i32(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -108,12 +108,12 @@ define { , } @sel_x2_f32(target("aarch6 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.s, z1.s }, pn8, { z6.s, z7.s }, { z4.s, z5.s } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -128,12 +128,12 @@ define { , } @sel_x2_i64(target("aarch64.sv ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 @@ -148,12 +148,12 @@ define { , } @sel_x2_f64(target("aarc ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z5.d, z4.d ; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z4.d, z3.d ; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: sel { z0.d, z1.d }, pn8, { z6.d, z7.d }, { z4.d, z5.d } ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload ; CHECK-NEXT: addvl sp, sp, #1 diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll index df504362680ba..3a21eaead5f72 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-selx4.ll @@ -8,17 +8,17 @@ define { , , , , , , , , , , , , , , , , , , , , , , , , %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -117,9 +117,9 @@ define void @st1_x2_f32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -135,9 +135,9 @@ define void @st1_x2_f64( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -153,8 +153,8 @@ define void @st1_x4_i8( %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -273,8 +273,8 @@ define void @st1_x4_f32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -293,8 +293,8 @@ define void @st1_x4_f64( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -315,9 +315,9 @@ define void @stnt1_x2_i8( %unused, %zn0, %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -351,9 +351,9 @@ define void @stnt1_x2_i32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -369,9 +369,9 @@ define void @stnt1_x2_i64( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -387,9 +387,9 @@ define void @stnt1_x2_f16( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -405,9 +405,9 @@ define void @stnt1_x2_bf16( %unused, %zn ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -423,9 +423,9 @@ define void @stnt1_x2_f32( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -441,9 +441,9 @@ define void @stnt1_x2_f64( %unused, %zn0 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b -; CHECK-NEXT: mov z3.d, z2.d ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0] ; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload @@ -459,8 +459,8 @@ define void @stnt1_x4_i8( %unused, %zn0, %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -499,8 +499,8 @@ define void @stnt1_x4_i32( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -519,8 +519,8 @@ define void @stnt1_x4_i64( %unused, %zn0, < ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -539,8 +539,8 @@ define void @stnt1_x4_f16( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -559,8 +559,8 @@ define void @stnt1_x4_bf16( %unused, %zn ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -579,8 +579,8 @@ define void @stnt1_x4_f32( %unused, %zn0, ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d @@ -599,8 +599,8 @@ define void @stnt1_x4_f64( %unused, %zn0 ; CHECK: // %bb.0: ; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: addvl sp, sp, #-1 -; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill ; CHECK-NEXT: mov p8.b, p0.b ; CHECK-NEXT: mov z6.d, z3.d ; CHECK-NEXT: mov z5.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 30ff70088454d..16521834090b5 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -147,11 +147,11 @@ define void @v8i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v4i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldr s1, [x0] -; CHECK-SD-NEXT: ldr s2, [x1] -; CHECK-SD-NEXT: movi d0, #0xff00ff00ff00ff -; CHECK-SD-NEXT: uaddl v1.8h, v1.8b, v2.8b -; CHECK-SD-NEXT: umin v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-SD-NEXT: umin v0.4h, v0.4h, v2.4h ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: str s0, [x2] ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index b31ce94cdaaea..d5f1febaeb7db 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -52,11 +52,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s -; CHECK-NEXT: shl v1.4s, v0.4s, #31 -; CHECK-NEXT: usra v1.4s, v0.4s, #1 -; CHECK-NEXT: movi v0.16b, #170 -; CHECK-NEXT: fneg v0.4s, v0.4s -; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s +; CHECK-NEXT: movi v1.16b, #170 +; CHECK-NEXT: shl v2.4s, v0.4s, #31 +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: usra v2.4s, v0.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v1.4s, v2.4s ; CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret %urem = urem <4 x i32> %X, diff --git a/llvm/test/CodeGen/AArch64/vec_uaddo.ll b/llvm/test/CodeGen/AArch64/vec_uaddo.ll index 00609b0df9b4e..37c6374215d81 100644 --- a/llvm/test/CodeGen/AArch64/vec_uaddo.ll +++ b/llvm/test/CodeGen/AArch64/vec_uaddo.ll @@ -100,9 +100,9 @@ define <6 x i32> @uaddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind { ; CHECK-NEXT: mov v0.s[3], w3 ; CHECK-NEXT: cmhi v3.4s, v3.4s, v2.4s ; CHECK-NEXT: str d2, [x8, #16] -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: mov w5, v3.s[1] ; CHECK-NEXT: fmov w4, s3 +; CHECK-NEXT: add v1.4s, v0.4s, v1.4s ; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s ; CHECK-NEXT: str q1, [x8] ; CHECK-NEXT: mov 
w1, v0.s[1] @@ -248,10 +248,10 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind { ; CHECK-NEXT: eor v2.8b, v0.8b, v1.8b ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: shl v2.4h, v2.4h, #15 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: cmlt v1.4h, v2.4h, #0 +; CHECK-NEXT: shl v1.4h, v2.4h, #15 ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI10_0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: cmlt v1.4h, v1.4h, #0 ; CHECK-NEXT: shl v0.4s, v0.4s, #31 ; CHECK-NEXT: and v1.8b, v1.8b, v2.8b ; CHECK-NEXT: cmlt v0.4s, v0.4s, #0 diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 66ef436f48c63..3254c5ebe9c6b 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -2177,65 +2177,65 @@ define i32 @test_udot_v48i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE-LABEL: test_udot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] -; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] ; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] -; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] +; CHECK-GI-BASE-NEXT: ushll v20.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 ; CHECK-GI-BASE-NEXT: ushll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v16.8h, v3.8b, #0 ; CHECK-GI-BASE-NEXT: ushll v5.8h, v1.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: ushll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: ushll v17.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v3.8h, v3.16b, #0 ; CHECK-GI-BASE-NEXT: ushll2 v2.8h, v2.16b, #0 ; CHECK-GI-BASE-NEXT: umull v18.4s, v4.4h, v5.4h ; CHECK-GI-BASE-NEXT: umull2 v4.4s, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: umull2 v19.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v16.4h -; CHECK-GI-BASE-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: ushll v5.8h, v6.8b, #0 -; CHECK-GI-BASE-NEXT: ushll v1.8h, v17.8b, #0 -; CHECK-GI-BASE-NEXT: umull2 v7.4s, v7.8h, v16.8h -; CHECK-GI-BASE-NEXT: ushll2 v6.8h, v6.16b, #0 -; CHECK-GI-BASE-NEXT: ushll2 v17.8h, v17.16b, #0 -; CHECK-GI-BASE-NEXT: addv s16, v18.4s -; CHECK-GI-BASE-NEXT: addv s4, v4.4s -; CHECK-GI-BASE-NEXT: umull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: umull v5.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: umull v19.4s, v16.4h, v17.4h +; CHECK-GI-BASE-NEXT: ushll v1.8h, v7.8b, #0 +; CHECK-GI-BASE-NEXT: umull2 v16.4s, v16.8h, v17.8h +; CHECK-GI-BASE-NEXT: umull v17.4s, v3.4h, v2.4h ; CHECK-GI-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: addv s3, v19.4s -; CHECK-GI-BASE-NEXT: umull v19.4s, v5.4h, v1.4h -; CHECK-GI-BASE-NEXT: umull2 v1.4s, v5.8h, v1.8h -; CHECK-GI-BASE-NEXT: addv s5, v20.4s +; CHECK-GI-BASE-NEXT: ushll2 v7.8h, v7.16b, #0 +; CHECK-GI-BASE-NEXT: addv s18, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: addv s5, v5.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: addv s7, v7.4s -; CHECK-GI-BASE-NEXT: umull v20.4s, v6.4h, v17.4h -; CHECK-GI-BASE-NEXT: umull2 v6.4s, v6.8h, v17.8h -; CHECK-GI-BASE-NEXT: fmov w8, s16 -; CHECK-GI-BASE-NEXT: fmov w9, s4 -; CHECK-GI-BASE-NEXT: fmov w10, s3 -; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s19, v19.4s +; CHECK-GI-BASE-NEXT: umull v3.4s, v1.4h, v20.4h ; CHECK-GI-BASE-NEXT: addv s2, v2.4s -; 
CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: umull2 v1.4s, v1.8h, v20.8h +; CHECK-GI-BASE-NEXT: umull v20.4s, v7.4h, v6.4h +; CHECK-GI-BASE-NEXT: fmov w8, s18 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s5 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: fmov w12, s19 +; CHECK-GI-BASE-NEXT: addv s4, v16.4s +; CHECK-GI-BASE-NEXT: addv s5, v17.4s +; CHECK-GI-BASE-NEXT: addv s3, v3.4s +; CHECK-GI-BASE-NEXT: umull2 v0.4s, v7.8h, v6.8h ; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s0 -; CHECK-GI-BASE-NEXT: addv s0, v1.4s -; CHECK-GI-BASE-NEXT: addv s1, v20.4s -; CHECK-GI-BASE-NEXT: addv s5, v6.4s -; CHECK-GI-BASE-NEXT: add w10, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: add w9, w11, w12 +; CHECK-GI-BASE-NEXT: add w8, w8, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 ; CHECK-GI-BASE-NEXT: fmov w12, s2 -; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s7 -; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: addv s4, v20.4s +; CHECK-GI-BASE-NEXT: addv s0, v0.4s +; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: add w10, w11, w12 -; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s3 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w10, s0 -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 ; CHECK-GI-BASE-NEXT: add w0, w8, w9 @@ -2527,65 +2527,65 @@ define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) { ; CHECK-GI-BASE-LABEL: test_sdot_v48i8: ; CHECK-GI-BASE: // %bb.0: // %entry ; CHECK-GI-BASE-NEXT: ldp q0, q3, [x1] -; CHECK-GI-BASE-NEXT: ldr q6, [x1, #32] +; CHECK-GI-BASE-NEXT: ldr q6, [x0, #32] ; CHECK-GI-BASE-NEXT: ldp q1, q2, [x0] -; CHECK-GI-BASE-NEXT: ldr q17, [x0, #32] +; CHECK-GI-BASE-NEXT: ldr q7, [x1, #32] +; CHECK-GI-BASE-NEXT: sshll v20.8h, v6.8b, #0 +; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 ; CHECK-GI-BASE-NEXT: sshll v4.8h, v0.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v7.8h, v3.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v16.8h, v3.8b, #0 ; CHECK-GI-BASE-NEXT: sshll v5.8h, v1.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0 -; CHECK-GI-BASE-NEXT: sshll v16.8h, v2.8b, #0 +; CHECK-GI-BASE-NEXT: sshll v17.8h, v2.8b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v3.8h, v3.16b, #0 ; CHECK-GI-BASE-NEXT: sshll2 v2.8h, v2.16b, #0 ; CHECK-GI-BASE-NEXT: smull v18.4s, v4.4h, v5.4h ; CHECK-GI-BASE-NEXT: smull2 v4.4s, v4.8h, v5.8h -; CHECK-GI-BASE-NEXT: smull2 v19.4s, v0.8h, v1.8h -; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v16.4h -; CHECK-GI-BASE-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-GI-BASE-NEXT: sshll v5.8h, v6.8b, #0 -; CHECK-GI-BASE-NEXT: sshll v1.8h, v17.8b, #0 -; CHECK-GI-BASE-NEXT: smull2 v7.4s, v7.8h, v16.8h -; CHECK-GI-BASE-NEXT: sshll2 v6.8h, v6.16b, #0 -; CHECK-GI-BASE-NEXT: sshll2 v17.8h, v17.16b, #0 -; CHECK-GI-BASE-NEXT: addv s16, v18.4s -; CHECK-GI-BASE-NEXT: addv s4, v4.4s -; CHECK-GI-BASE-NEXT: smull v18.4s, v3.4h, v2.4h +; CHECK-GI-BASE-NEXT: smull v5.4s, v0.4h, v1.4h +; CHECK-GI-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h +; CHECK-GI-BASE-NEXT: smull v19.4s, v16.4h, v17.4h +; CHECK-GI-BASE-NEXT: sshll v1.8h, 
v7.8b, #0 +; CHECK-GI-BASE-NEXT: smull2 v16.4s, v16.8h, v17.8h +; CHECK-GI-BASE-NEXT: smull v17.4s, v3.4h, v2.4h ; CHECK-GI-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h -; CHECK-GI-BASE-NEXT: addv s3, v19.4s -; CHECK-GI-BASE-NEXT: smull v19.4s, v5.4h, v1.4h -; CHECK-GI-BASE-NEXT: smull2 v1.4s, v5.8h, v1.8h -; CHECK-GI-BASE-NEXT: addv s5, v20.4s +; CHECK-GI-BASE-NEXT: sshll2 v7.8h, v7.16b, #0 +; CHECK-GI-BASE-NEXT: addv s18, v18.4s +; CHECK-GI-BASE-NEXT: addv s4, v4.4s +; CHECK-GI-BASE-NEXT: addv s5, v5.4s ; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: addv s7, v7.4s -; CHECK-GI-BASE-NEXT: smull v20.4s, v6.4h, v17.4h -; CHECK-GI-BASE-NEXT: smull2 v6.4s, v6.8h, v17.8h -; CHECK-GI-BASE-NEXT: fmov w8, s16 -; CHECK-GI-BASE-NEXT: fmov w9, s4 -; CHECK-GI-BASE-NEXT: fmov w10, s3 -; CHECK-GI-BASE-NEXT: addv s3, v18.4s +; CHECK-GI-BASE-NEXT: addv s19, v19.4s +; CHECK-GI-BASE-NEXT: smull v3.4s, v1.4h, v20.4h ; CHECK-GI-BASE-NEXT: addv s2, v2.4s -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: addv s4, v19.4s +; CHECK-GI-BASE-NEXT: smull2 v1.4s, v1.8h, v20.8h +; CHECK-GI-BASE-NEXT: smull v20.4s, v7.4h, v6.4h +; CHECK-GI-BASE-NEXT: fmov w8, s18 +; CHECK-GI-BASE-NEXT: fmov w9, s4 +; CHECK-GI-BASE-NEXT: fmov w10, s5 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: fmov w12, s19 +; CHECK-GI-BASE-NEXT: addv s4, v16.4s +; CHECK-GI-BASE-NEXT: addv s5, v17.4s +; CHECK-GI-BASE-NEXT: addv s3, v3.4s +; CHECK-GI-BASE-NEXT: smull2 v0.4s, v7.8h, v6.8h ; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s0 -; CHECK-GI-BASE-NEXT: addv s0, v1.4s -; CHECK-GI-BASE-NEXT: addv s1, v20.4s -; CHECK-GI-BASE-NEXT: addv s5, v6.4s -; CHECK-GI-BASE-NEXT: add w10, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w11, s3 +; CHECK-GI-BASE-NEXT: addv s1, v1.4s +; CHECK-GI-BASE-NEXT: add w9, w11, w12 +; CHECK-GI-BASE-NEXT: add w8, w8, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s5 ; CHECK-GI-BASE-NEXT: fmov w12, s2 -; CHECK-GI-BASE-NEXT: add w8, w8, w9 -; CHECK-GI-BASE-NEXT: fmov w9, s7 -; CHECK-GI-BASE-NEXT: add w9, w10, w9 +; CHECK-GI-BASE-NEXT: addv s4, v20.4s +; CHECK-GI-BASE-NEXT: addv s0, v0.4s +; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: add w10, w11, w12 -; CHECK-GI-BASE-NEXT: fmov w11, s4 +; CHECK-GI-BASE-NEXT: fmov w11, s3 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 -; CHECK-GI-BASE-NEXT: fmov w10, s0 -; CHECK-GI-BASE-NEXT: fmov w11, s5 -; CHECK-GI-BASE-NEXT: add w9, w9, w10 ; CHECK-GI-BASE-NEXT: fmov w10, s1 +; CHECK-GI-BASE-NEXT: fmov w11, s0 +; CHECK-GI-BASE-NEXT: add w9, w9, w10 +; CHECK-GI-BASE-NEXT: fmov w10, s4 ; CHECK-GI-BASE-NEXT: add w8, w8, w9 ; CHECK-GI-BASE-NEXT: add w9, w10, w11 ; CHECK-GI-BASE-NEXT: add w0, w8, w9 @@ -2640,13 +2640,13 @@ define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) { ; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use: ; CHECK-SD-DOT: // %bb.0: // %entry ; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000 +; CHECK-SD-DOT-NEXT: ushll v3.8h, v0.8b, #0 +; CHECK-SD-DOT-NEXT: ushll v4.8h, v1.8b, #0 ; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b -; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-SD-DOT-NEXT: umull v0.4s, v1.4h, v0.4h -; CHECK-SD-DOT-NEXT: addp v2.2s, v2.2s, v2.2s +; CHECK-SD-DOT-NEXT: umull v0.4s, v4.4h, v3.4h +; CHECK-SD-DOT-NEXT: addp v1.2s, v2.2s, v2.2s ; CHECK-SD-DOT-NEXT: fmov w9, s0 -; CHECK-SD-DOT-NEXT: fmov w8, s2 +; CHECK-SD-DOT-NEXT: fmov w8, s1 ; CHECK-SD-DOT-NEXT: add w0, w8, w9 ; CHECK-SD-DOT-NEXT: ret ; 
@@ -3534,21 +3534,21 @@ entry: define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-SD-NEXT: ushll v2.2d, v0.2s, #0 -; CHECK-SD-NEXT: ushll v3.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v2.2d, v1.2s, #0 +; CHECK-SD-NEXT: ushll v3.2d, v0.2s, #0 ; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0 ; CHECK-SD-NEXT: ushll2 v1.2d, v1.4s, #0 -; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: shl v3.2d, v3.2d, #56 +; CHECK-SD-NEXT: shl v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: shl v0.2d, v0.2d, #56 ; CHECK-SD-NEXT: shl v1.2d, v1.2d, #56 -; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56 ; CHECK-SD-NEXT: sshr v3.2d, v3.2d, #56 -; CHECK-SD-NEXT: ssra v2.2d, v0.2d, #56 -; CHECK-SD-NEXT: ssra v3.2d, v1.2d, #56 -; CHECK-SD-NEXT: add v0.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: sshr v2.2d, v2.2d, #56 +; CHECK-SD-NEXT: ssra v3.2d, v0.2d, #56 +; CHECK-SD-NEXT: ssra v2.2d, v1.2d, #56 +; CHECK-SD-NEXT: add v0.2d, v3.2d, v2.2d ; CHECK-SD-NEXT: addp d0, v0.2d ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret @@ -3816,37 +3816,37 @@ define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) { ; CHECK-SD-NEXT: ldr b1, [sp, #64] ; CHECK-SD-NEXT: add x8, sp, #72 ; CHECK-SD-NEXT: ldr b2, [sp] -; CHECK-SD-NEXT: add x9, sp, #8 +; CHECK-SD-NEXT: add x9, sp, #80 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #80 +; CHECK-SD-NEXT: add x8, sp, #8 ; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #88 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #24 +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #24 ; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 ; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 ; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-NEXT: add x9, sp, #48 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-NEXT: mov v0.b[6], w6 ; CHECK-SD-NEXT: mov v0.b[7], w7 @@ -3942,37 +3942,37 @@ define i16 
@add_v24i8_v24i16_sext(<24 x i8> %x) { ; CHECK-SD-NEXT: ldr b1, [sp, #64] ; CHECK-SD-NEXT: add x8, sp, #72 ; CHECK-SD-NEXT: ldr b2, [sp] -; CHECK-SD-NEXT: add x9, sp, #8 +; CHECK-SD-NEXT: add x9, sp, #80 ; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #80 +; CHECK-SD-NEXT: add x8, sp, #8 ; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #88 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-NEXT: add x9, sp, #24 +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #16 +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #24 ; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-NEXT: add x9, sp, #32 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 ; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-NEXT: add x9, sp, #40 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #104 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 ; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-NEXT: add x9, sp, #48 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-NEXT: mov v0.b[6], w6 ; CHECK-SD-NEXT: mov v0.b[7], w7 @@ -4069,48 +4069,48 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] ; CHECK-SD-BASE-NEXT: add x8, sp, #72 ; CHECK-SD-BASE-NEXT: ldr b2, [sp] -; CHECK-SD-BASE-NEXT: add x9, sp, #8 +; CHECK-SD-BASE-NEXT: add x9, sp, #80 ; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #80 +; CHECK-SD-BASE-NEXT: add x8, sp, #8 ; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #16 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #88 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #24 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #16 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #88 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #24 ; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #96 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #32 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #96 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], 
[x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #32 ; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #104 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #40 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #112 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #104 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #40 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #112 ; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #48 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #120 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #48 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #120 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-BASE-NEXT: add x8, sp, #56 +; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 ; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-BASE-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 ; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-SD-BASE-NEXT: uaddl2 v3.4s, v0.8h, v1.8h ; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-SD-BASE-NEXT: ushll v1.8h, v2.8b, #0 -; CHECK-SD-BASE-NEXT: uaddw2 v2.4s, v3.4s, v1.8h -; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v1.4h -; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-BASE-NEXT: uaddw2 v1.4s, v3.4s, v2.8h +; CHECK-SD-BASE-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret @@ -4147,9 +4147,9 @@ define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) { ; CHECK-SD-DOT-NEXT: udot v4.2s, v1.8b, v5.8b ; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 ; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s +; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #8 -; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #16 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] @@ -4342,48 +4342,48 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-SD-BASE-NEXT: ldr b1, [sp, #64] ; CHECK-SD-BASE-NEXT: add x8, sp, #72 ; CHECK-SD-BASE-NEXT: ldr b2, [sp] -; CHECK-SD-BASE-NEXT: add x9, sp, #8 +; CHECK-SD-BASE-NEXT: add x9, sp, #80 ; CHECK-SD-BASE-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #80 +; CHECK-SD-BASE-NEXT: add x8, sp, #8 ; CHECK-SD-BASE-NEXT: mov v0.b[1], w1 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #16 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #88 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #24 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[1], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #16 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #88 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #24 ; CHECK-SD-BASE-NEXT: mov v0.b[2], w2 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-BASE-NEXT: 
add x8, sp, #96 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #32 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #96 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #32 ; CHECK-SD-BASE-NEXT: mov v0.b[3], w3 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #104 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #40 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #112 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #104 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #40 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[5], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #112 ; CHECK-SD-BASE-NEXT: mov v0.b[4], w4 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-SD-BASE-NEXT: add x9, sp, #48 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-BASE-NEXT: add x8, sp, #120 -; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 -; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-BASE-NEXT: add x8, sp, #48 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[6], [x9] +; CHECK-SD-BASE-NEXT: add x9, sp, #120 +; CHECK-SD-BASE-NEXT: ld1 { v2.b }[6], [x8] ; CHECK-SD-BASE-NEXT: add x8, sp, #56 +; CHECK-SD-BASE-NEXT: mov v0.b[5], w5 +; CHECK-SD-BASE-NEXT: ld1 { v1.b }[7], [x9] ; CHECK-SD-BASE-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-SD-BASE-NEXT: mov v0.b[6], w6 ; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-SD-BASE-NEXT: mov v0.b[7], w7 ; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-SD-BASE-NEXT: saddl2 v3.4s, v0.8h, v1.8h ; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h -; CHECK-SD-BASE-NEXT: sshll v1.8h, v2.8b, #0 -; CHECK-SD-BASE-NEXT: saddw2 v2.4s, v3.4s, v1.8h -; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v1.4h -; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-SD-BASE-NEXT: saddw2 v1.4s, v3.4s, v2.8h +; CHECK-SD-BASE-NEXT: saddw v0.4s, v0.4s, v2.4h +; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-SD-BASE-NEXT: addv s0, v0.4s ; CHECK-SD-BASE-NEXT: fmov w0, s0 ; CHECK-SD-BASE-NEXT: ret @@ -4420,9 +4420,9 @@ define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) { ; CHECK-SD-DOT-NEXT: sdot v4.2s, v1.8b, v5.8b ; CHECK-SD-DOT-NEXT: mov v0.b[7], w7 ; CHECK-SD-DOT-NEXT: addp v1.2s, v4.2s, v4.2s +; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[8], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #8 -; CHECK-SD-DOT-NEXT: fmov w9, s1 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[9], [x8] ; CHECK-SD-DOT-NEXT: add x8, sp, #16 ; CHECK-SD-DOT-NEXT: ld1 { v0.b }[10], [x8] @@ -4611,23 +4611,23 @@ entry: define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-SD-BASE-LABEL: full: ; CHECK-SD-BASE: // %bb.0: // %entry -; CHECK-SD-BASE-NEXT: ldr d0, [x2] -; CHECK-SD-BASE-NEXT: ldr d1, [x0] ; CHECK-SD-BASE-NEXT: // kill: def $w3 killed $w3 def $x3 ; CHECK-SD-BASE-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-SD-BASE-NEXT: sxtw x8, w3 ; CHECK-SD-BASE-NEXT: sxtw x9, w1 -; CHECK-SD-BASE-NEXT: uabdl v0.8h, v1.8b, v0.8b -; CHECK-SD-BASE-NEXT: add x11, x2, x8 +; CHECK-SD-BASE-NEXT: ldr d0, [x0] +; CHECK-SD-BASE-NEXT: ldr d1, [x2] ; CHECK-SD-BASE-NEXT: add x10, x0, x9 -; CHECK-SD-BASE-NEXT: ldr d2, [x11] -; CHECK-SD-BASE-NEXT: add x11, x11, x8 +; CHECK-SD-BASE-NEXT: add x11, x2, x8 +; CHECK-SD-BASE-NEXT: uabdl v0.8h, v0.8b, v1.8b ; CHECK-SD-BASE-NEXT: 
ldr d1, [x10] +; CHECK-SD-BASE-NEXT: ldr d2, [x11] ; CHECK-SD-BASE-NEXT: add x10, x10, x9 -; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h +; CHECK-SD-BASE-NEXT: add x11, x11, x8 ; CHECK-SD-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b ; CHECK-SD-BASE-NEXT: ldr d2, [x11] ; CHECK-SD-BASE-NEXT: add x11, x11, x8 +; CHECK-SD-BASE-NEXT: uaddlp v0.4s, v0.8h ; CHECK-SD-BASE-NEXT: uadalp v0.4s, v1.8h ; CHECK-SD-BASE-NEXT: ldr d1, [x10] ; CHECK-SD-BASE-NEXT: add x10, x10, x9 @@ -4723,98 +4723,98 @@ define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1 ; CHECK-GI-NEXT: // kill: def $w3 killed $w3 def $x3 -; CHECK-GI-NEXT: sxtw x8, w1 -; CHECK-GI-NEXT: sxtw x9, w3 +; CHECK-GI-NEXT: sxtw x8, w3 +; CHECK-GI-NEXT: sxtw x9, w1 ; CHECK-GI-NEXT: ldr d0, [x0] ; CHECK-GI-NEXT: ldr d1, [x2] -; CHECK-GI-NEXT: add x10, x0, x8 -; CHECK-GI-NEXT: add x11, x2, x9 +; CHECK-GI-NEXT: add x10, x0, x9 +; CHECK-GI-NEXT: add x11, x2, x8 ; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-GI-NEXT: ldr d2, [x10] -; CHECK-GI-NEXT: ldr d3, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x12, x11, x8 ; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 +; CHECK-GI-NEXT: ldr d3, [x11] +; CHECK-GI-NEXT: ldr d4, [x10] +; CHECK-GI-NEXT: ldr d5, [x12] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x12, x8 ; CHECK-GI-NEXT: ushll v2.8h, v2.8b, #0 ; CHECK-GI-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-GI-NEXT: ldr d4, [x10] -; CHECK-GI-NEXT: ldr d5, [x11] -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 +; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-GI-NEXT: uabdl v6.4s, v0.4h, v1.4h ; CHECK-GI-NEXT: uabdl2 v0.4s, v0.8h, v1.8h ; CHECK-GI-NEXT: ldr d1, [x10] -; CHECK-GI-NEXT: ushll v4.8h, v4.8b, #0 -; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 ; CHECK-GI-NEXT: ldr d7, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: uabdl v16.4s, v2.4h, v3.4h ; CHECK-GI-NEXT: uabdl2 v2.4s, v2.8h, v3.8h -; CHECK-GI-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: uabdl v1.4s, v4.4h, v5.4h +; CHECK-GI-NEXT: uabdl v3.4s, v4.4h, v5.4h ; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v5.8h +; CHECK-GI-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 ; CHECK-GI-NEXT: ldr d5, [x10] -; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s -; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: ldr d17, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-GI-NEXT: uabdl v6.4s, v3.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v3.4s, v3.8h, v7.8h ; CHECK-GI-NEXT: ushll v5.8h, v5.8b, #0 -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: ushll v7.8h, v16.8b, #0 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: ldr d16, [x10] -; CHECK-GI-NEXT: ldr d17, [x11] -; CHECK-GI-NEXT: add v1.4s, v1.4s, v4.4s -; CHECK-GI-NEXT: add x10, x10, x8 -; CHECK-GI-NEXT: add x11, x11, x9 -; CHECK-GI-NEXT: add v3.4s, v6.4s, v3.4s -; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 ; CHECK-GI-NEXT: ushll v17.8h, v17.8b, #0 -; CHECK-GI-NEXT: uabdl v22.4s, v5.4h, v7.4h -; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v7.8h +; CHECK-GI-NEXT: add v2.4s, v16.4s, v2.4s +; CHECK-GI-NEXT: add v3.4s, v3.4s, v4.4s +; CHECK-GI-NEXT: uabdl v4.4s, v1.4h, v7.4h +; CHECK-GI-NEXT: uabdl2 v1.4s, 
v1.8h, v7.8h +; CHECK-GI-NEXT: ldr d7, [x10] +; CHECK-GI-NEXT: ldr d16, [x11] +; CHECK-GI-NEXT: add x10, x10, x9 +; CHECK-GI-NEXT: add x11, x11, x8 ; CHECK-GI-NEXT: ldr d18, [x10] +; CHECK-GI-NEXT: ldr d20, [x10, x9] ; CHECK-GI-NEXT: ldr d19, [x11] -; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: ldr d21, [x11, x8] +; CHECK-GI-NEXT: uabdl v6.4s, v5.4h, v17.4h +; CHECK-GI-NEXT: ushll v7.8h, v7.8b, #0 +; CHECK-GI-NEXT: ushll v16.8h, v16.8b, #0 +; CHECK-GI-NEXT: uabdl2 v5.4s, v5.8h, v17.8h +; CHECK-GI-NEXT: ushll v17.8h, v18.8b, #0 +; CHECK-GI-NEXT: ushll v18.8h, v19.8b, #0 +; CHECK-GI-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-GI-NEXT: ushll v4.8h, v20.8b, #0 +; CHECK-GI-NEXT: ushll v19.8h, v21.8b, #0 ; CHECK-GI-NEXT: addv s2, v2.4s -; CHECK-GI-NEXT: addv s1, v1.4s -; CHECK-GI-NEXT: ushll v18.8h, v18.8b, #0 -; CHECK-GI-NEXT: ushll v19.8h, v19.8b, #0 -; CHECK-GI-NEXT: uabdl v4.4s, v16.4h, v17.4h -; CHECK-GI-NEXT: uabdl2 v16.4s, v16.8h, v17.8h -; CHECK-GI-NEXT: add v5.4s, v22.4s, v5.4s -; CHECK-GI-NEXT: ldr d20, [x10, x8] -; CHECK-GI-NEXT: ldr d21, [x11, x9] +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: addv s3, v3.4s +; CHECK-GI-NEXT: uabdl v20.4s, v7.4h, v16.4h +; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v16.8h +; CHECK-GI-NEXT: add v5.4s, v6.4s, v5.4s +; CHECK-GI-NEXT: uabdl v6.4s, v17.4h, v18.4h +; CHECK-GI-NEXT: uabdl2 v16.4s, v17.8h, v18.8h +; CHECK-GI-NEXT: uabdl v17.4s, v4.4h, v19.4h +; CHECK-GI-NEXT: uabdl2 v4.4s, v4.8h, v19.8h ; CHECK-GI-NEXT: fmov w8, s2 +; CHECK-GI-NEXT: addv s1, v1.4s ; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: ushll v7.8h, v20.8b, #0 -; CHECK-GI-NEXT: ushll v20.8h, v21.8b, #0 -; CHECK-GI-NEXT: uabdl v6.4s, v18.4h, v19.4h -; CHECK-GI-NEXT: uabdl2 v17.4s, v18.8h, v19.8h -; CHECK-GI-NEXT: add v4.4s, v4.4s, v16.4s -; CHECK-GI-NEXT: addv s5, v5.4s -; CHECK-GI-NEXT: fmov w10, s1 +; CHECK-GI-NEXT: fmov w10, s3 +; CHECK-GI-NEXT: add v7.4s, v20.4s, v7.4s +; CHECK-GI-NEXT: add v0.4s, v17.4s, v4.4s +; CHECK-GI-NEXT: addv s4, v5.4s +; CHECK-GI-NEXT: add v2.4s, v6.4s, v16.4s ; CHECK-GI-NEXT: add w8, w8, w9 -; CHECK-GI-NEXT: fmov w9, s3 -; CHECK-GI-NEXT: uabdl v18.4s, v7.4h, v20.4h -; CHECK-GI-NEXT: uabdl2 v7.4s, v7.8h, v20.8h -; CHECK-GI-NEXT: add v6.4s, v6.4s, v17.4s +; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w10, w8 -; CHECK-GI-NEXT: addv s0, v4.4s +; CHECK-GI-NEXT: addv s3, v7.4s +; CHECK-GI-NEXT: addv s1, v2.4s +; CHECK-GI-NEXT: addv s0, v0.4s ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s5 -; CHECK-GI-NEXT: add v7.4s, v18.4s, v7.4s -; CHECK-GI-NEXT: addv s1, v6.4s +; CHECK-GI-NEXT: fmov w9, s4 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s0 -; CHECK-GI-NEXT: addv s2, v7.4s +; CHECK-GI-NEXT: fmov w9, s3 ; CHECK-GI-NEXT: add w8, w9, w8 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: add w8, w9, w8 -; CHECK-GI-NEXT: fmov w9, s2 +; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: add w0, w9, w8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll index de26676b5c73e..063b23275c616 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcopysign.ll @@ -264,15 +264,15 @@ define <4 x bfloat> @test_copysign_v4bf16_v4bf16(<4 x bfloat> %a, <4 x bfloat> % define <4 x bfloat> @test_copysign_v4bf16_v4f32(<4 x bfloat> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v4bf16_v4f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v2, #127, msl #8 -; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v2, #1 +; CHECK-NEXT: movi.4s v3, 
#127, msl #8 ; CHECK-NEXT: ushr.4s v4, v1, #16 -; CHECK-NEXT: add.4s v2, v1, v2 -; CHECK-NEXT: and.16b v3, v4, v3 -; CHECK-NEXT: add.4s v2, v3, v2 -; CHECK-NEXT: fcmeq.4s v3, v1, v1 +; CHECK-NEXT: and.16b v2, v4, v2 +; CHECK-NEXT: add.4s v3, v1, v3 +; CHECK-NEXT: fcmeq.4s v4, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 -; CHECK-NEXT: bit.16b v1, v2, v3 +; CHECK-NEXT: add.4s v2, v2, v3 +; CHECK-NEXT: bit.16b v1, v2, v4 ; CHECK-NEXT: mvni.4h v2, #128, lsl #8 ; CHECK-NEXT: shrn.4h v1, v1, #16 ; CHECK-NEXT: bif.8b v0, v1, v2 @@ -286,16 +286,16 @@ define <4 x bfloat> @test_copysign_v4bf16_v4f64(<4 x bfloat> %a, <4 x double> %b ; CHECK-LABEL: test_copysign_v4bf16_v4f64: ; CHECK: ; %bb.0: ; CHECK-NEXT: fcvtxn v1.2s, v1.2d -; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v3, #127, msl #8 ; CHECK-NEXT: fcvtxn2 v1.4s, v2.2d -; CHECK-NEXT: movi.4s v2, #127, msl #8 +; CHECK-NEXT: movi.4s v2, #1 ; CHECK-NEXT: ushr.4s v4, v1, #16 -; CHECK-NEXT: add.4s v2, v1, v2 -; CHECK-NEXT: and.16b v3, v4, v3 -; CHECK-NEXT: add.4s v2, v3, v2 -; CHECK-NEXT: fcmeq.4s v3, v1, v1 +; CHECK-NEXT: add.4s v3, v1, v3 +; CHECK-NEXT: and.16b v2, v4, v2 +; CHECK-NEXT: fcmeq.4s v4, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 -; CHECK-NEXT: bit.16b v1, v2, v3 +; CHECK-NEXT: add.4s v2, v2, v3 +; CHECK-NEXT: bit.16b v1, v2, v4 ; CHECK-NEXT: mvni.4h v2, #128, lsl #8 ; CHECK-NEXT: shrn.4h v1, v1, #16 ; CHECK-NEXT: bif.8b v0, v1, v2 @@ -322,22 +322,22 @@ define <8 x bfloat> @test_copysign_v8bf16_v8bf16(<8 x bfloat> %a, <8 x bfloat> % define <8 x bfloat> @test_copysign_v8bf16_v8f32(<8 x bfloat> %a, <8 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v8bf16_v8f32: ; CHECK: ; %bb.0: -; CHECK-NEXT: movi.4s v3, #127, msl #8 -; CHECK-NEXT: movi.4s v4, #1 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: movi.4s v4, #127, msl #8 ; CHECK-NEXT: ushr.4s v5, v2, #16 ; CHECK-NEXT: ushr.4s v6, v1, #16 -; CHECK-NEXT: add.4s v7, v2, v3 -; CHECK-NEXT: add.4s v3, v1, v3 -; CHECK-NEXT: and.16b v5, v5, v4 -; CHECK-NEXT: and.16b v4, v6, v4 +; CHECK-NEXT: and.16b v5, v5, v3 +; CHECK-NEXT: add.4s v7, v2, v4 +; CHECK-NEXT: and.16b v3, v6, v3 +; CHECK-NEXT: add.4s v4, v1, v4 ; CHECK-NEXT: fcmeq.4s v6, v2, v2 ; CHECK-NEXT: orr.4s v2, #64, lsl #16 ; CHECK-NEXT: add.4s v5, v5, v7 -; CHECK-NEXT: add.4s v3, v4, v3 -; CHECK-NEXT: fcmeq.4s v4, v1, v1 +; CHECK-NEXT: fcmeq.4s v7, v1, v1 ; CHECK-NEXT: orr.4s v1, #64, lsl #16 +; CHECK-NEXT: add.4s v3, v3, v4 ; CHECK-NEXT: bit.16b v2, v5, v6 -; CHECK-NEXT: bit.16b v1, v3, v4 +; CHECK-NEXT: bit.16b v1, v3, v7 ; CHECK-NEXT: uzp2.8h v1, v1, v2 ; CHECK-NEXT: mvni.8h v2, #128, lsl #8 ; CHECK-NEXT: bif.16b v0, v1, v2 diff --git a/llvm/test/CodeGen/AArch64/vector-gep.ll b/llvm/test/CodeGen/AArch64/vector-gep.ll index 30317dce85e65..c7858416e1796 100644 --- a/llvm/test/CodeGen/AArch64/vector-gep.ll +++ b/llvm/test/CodeGen/AArch64/vector-gep.ll @@ -13,11 +13,11 @@ define <2 x ptr> @vector_gep(<2 x ptr> %0) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: Lloh0: ; CHECK-NEXT: adrp x8, lCPI0_0@PAGE +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff ; CHECK-NEXT: Lloh1: ; CHECK-NEXT: ldr q1, [x8, lCPI0_0@PAGEOFF] ; CHECK-NEXT: add v0.2d, v0.2d, v1.2d -; CHECK-NEXT: movi v1.2d, #0x000000ffffffff -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret ; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 entry: diff --git a/llvm/test/CodeGen/AArch64/vselect-constants.ll b/llvm/test/CodeGen/AArch64/vselect-constants.ll index a32147eebd759..5e6ff1e0740ce 100644 --- a/llvm/test/CodeGen/AArch64/vselect-constants.ll +++ 
b/llvm/test/CodeGen/AArch64/vselect-constants.ll @@ -370,10 +370,9 @@ define @signbit_mask_xor_nxv16i8( %a, %a, zeroinitializer %xor = xor %a, %b diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 08ad34c7b03ba..599bd811d7d59 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1697,14 +1697,14 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-NEXT: ldp q18, q16, [x10, #96] ; CHECK-NEXT: uaddw.2d v2, v17, v2 ; CHECK-NEXT: stp q4, q5, [x10, #32] -; CHECK-NEXT: ldp q17, q5, [x10, #64] -; CHECK-NEXT: uaddw2.2d v16, v16, v7 +; CHECK-NEXT: uaddw2.2d v5, v16, v7 +; CHECK-NEXT: ldp q16, q4, [x10, #64] ; CHECK-NEXT: uaddw.2d v7, v18, v7 ; CHECK-NEXT: stp q2, q6, [x10] -; CHECK-NEXT: uaddw2.2d v4, v5, v3 -; CHECK-NEXT: uaddw.2d v3, v17, v3 -; CHECK-NEXT: stp q7, q16, [x10, #96] -; CHECK-NEXT: stp q3, q4, [x10, #64] +; CHECK-NEXT: uaddw2.2d v4, v4, v3 +; CHECK-NEXT: uaddw.2d v2, v16, v3 +; CHECK-NEXT: stp q7, q5, [x10, #96] +; CHECK-NEXT: stp q2, q4, [x10, #64] ; CHECK-NEXT: b.ne LBB17_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -1729,15 +1729,15 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: ld1 { v3.8b }, [x10] ; CHECK-BE-NEXT: add x10, x1, x8 ; CHECK-BE-NEXT: add x8, x8, #128 -; CHECK-BE-NEXT: add x15, x10, #96 ; CHECK-BE-NEXT: add x11, x10, #32 ; CHECK-BE-NEXT: add x14, x10, #64 +; CHECK-BE-NEXT: add x15, x10, #96 ; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b ; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] -; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b +; CHECK-BE-NEXT: ld1 { v5.2d }, [x10] +; CHECK-BE-NEXT: tbl v6.16b, { v3.16b }, v1.16b ; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v6.2d }, [x10] +; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] ; CHECK-BE-NEXT: ld1 { v19.2d }, [x14] ; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] ; CHECK-BE-NEXT: add x12, x10, #48 @@ -1747,11 +1747,12 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: rev32 v7.8b, v4.8b ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v17.8b, v2.8b -; CHECK-BE-NEXT: ext v18.16b, v5.16b, v5.16b, #8 +; CHECK-BE-NEXT: ext v18.16b, v6.16b, v6.16b, #8 ; CHECK-BE-NEXT: ext v20.16b, v3.16b, v3.16b, #8 ; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-BE-NEXT: rev32 v5.8b, v5.8b +; CHECK-BE-NEXT: rev32 v6.8b, v6.8b ; CHECK-BE-NEXT: rev32 v3.8b, v3.8b +; CHECK-BE-NEXT: ld1 { v22.2d }, [x12] ; CHECK-BE-NEXT: cmp x8, #1024 ; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: uaddw v7.2d, v16.2d, v7.2s @@ -1760,22 +1761,21 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE-NEXT: rev32 v20.8b, v20.8b ; CHECK-BE-NEXT: rev32 v2.8b, v2.8b ; CHECK-BE-NEXT: uaddw v17.2d, v19.2d, v17.2s -; CHECK-BE-NEXT: ld1 { v19.2d }, [x12] -; CHECK-BE-NEXT: uaddw v5.2d, v21.2d, v5.2s -; CHECK-BE-NEXT: ld1 { v21.2d }, [x13] -; CHECK-BE-NEXT: uaddw v3.2d, v6.2d, v3.2s -; CHECK-BE-NEXT: ld1 { v6.2d }, [x17] -; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s +; CHECK-BE-NEXT: ld1 { v19.2d }, [x13] +; CHECK-BE-NEXT: uaddw v6.2d, v21.2d, v6.2s +; CHECK-BE-NEXT: uaddw v3.2d, v5.2d, v3.2s +; CHECK-BE-NEXT: ld1 { v5.2d }, [x17] ; CHECK-BE-NEXT: st1 { v7.2d }, [x15] -; CHECK-BE-NEXT: uaddw v7.2d, v19.2d, v18.2s -; CHECK-BE-NEXT: uaddw v16.2d, v21.2d, v20.2s -; CHECK-BE-NEXT: uaddw v2.2d, v6.2d, 
v2.2s -; CHECK-BE-NEXT: st1 { v17.2d }, [x14] -; CHECK-BE-NEXT: st1 { v5.2d }, [x11] +; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s +; CHECK-BE-NEXT: st1 { v6.2d }, [x11] +; CHECK-BE-NEXT: uaddw v6.2d, v22.2d, v18.2s ; CHECK-BE-NEXT: st1 { v3.2d }, [x10] +; CHECK-BE-NEXT: uaddw v3.2d, v19.2d, v20.2s +; CHECK-BE-NEXT: uaddw v2.2d, v5.2d, v2.2s +; CHECK-BE-NEXT: st1 { v17.2d }, [x14] ; CHECK-BE-NEXT: st1 { v4.2d }, [x16] -; CHECK-BE-NEXT: st1 { v7.2d }, [x12] -; CHECK-BE-NEXT: st1 { v16.2d }, [x13] +; CHECK-BE-NEXT: st1 { v6.2d }, [x12] +; CHECK-BE-NEXT: st1 { v3.2d }, [x13] ; CHECK-BE-NEXT: st1 { v2.2d }, [x17] ; CHECK-BE-NEXT: b.ne .LBB17_1 ; CHECK-BE-NEXT: // %bb.2: // %exit diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir index ee0e83c5e0763..020761352148f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir @@ -254,8 +254,8 @@ body: | ; CHECK-NEXT: %one_s32:_(s32) = G_ANYEXT %one(s16) ; CHECK-NEXT: %one_undef:_(<2 x s16>) = G_BUILD_VECTOR_TRUNC %one_s32(s32), %undef(s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat - ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %zero_undef, [[FMUL]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %one_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE [[FMUL]], %zero_undef + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], %one_undef ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 @@ -306,7 +306,7 @@ body: | ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(<2 x s16>) = G_FMUL [[COPY]], %two_splat ; CHECK-NEXT: %snan_undef_fcan:_(<2 x s16>) = G_FCANONICALIZE %snan_undef ; CHECK-NEXT: [[FMAXNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM_IEEE %snan_undef_fcan, [[FMUL]] - ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE %qnan_undef, [[FMAXNUM_IEEE]] + ; CHECK-NEXT: [[FMINNUM_IEEE:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM_IEEE [[FMAXNUM_IEEE]], %qnan_undef ; CHECK-NEXT: $vgpr0 = COPY [[FMINNUM_IEEE]](<2 x s16>) %0:_(<2 x s16>) = COPY $vgpr0 %two:_(s16) = G_FCONSTANT half 0xH4000 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir index d6321dae3aa7e..67e6de1ce7644 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-and.mir @@ -318,7 +318,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %val:_(s32) = COPY $vgpr4 ; CHECK-NEXT: %k255:_(s32) = G_CONSTANT i32 255 - ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %k255, %val + ; CHECK-NEXT: %umin0:_(s32) = G_UMIN %val, %k255 ; CHECK-NEXT: $vgpr0 = COPY %umin0(s32) %ptr0:_(p1) = COPY $vgpr0_vgpr1 %ptr1:_(p1) = COPY $vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll index dc13dee4f148a..1d94d76da148f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smed3.ll @@ -145,10 +145,10 @@ define <2 x i16> @test_max_K0min_K1Val__v2i16(<2 x i16> %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 17 ; GFX8-NEXT: v_min_i16_e32 v1, 17, v0 -; GFX8-NEXT: v_min_i16_sdwa v0, v2, v0 dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, -12 ; GFX8-NEXT: v_max_i16_e32 v1, -12, v1 -; GFX8-NEXT: v_max_i16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll index 7e38762e7b559..a8233054db9bc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/umed3.ll @@ -145,10 +145,10 @@ define <2 x i16> @test_max_K0min_K1Val__v2u16(<2 x i16> %a) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v2, 17 ; GFX8-NEXT: v_min_u16_e32 v1, 17, v0 -; GFX8-NEXT: v_min_u16_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-NEXT: v_min_u16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX8-NEXT: v_mov_b32_e32 v2, 12 ; GFX8-NEXT: v_max_u16_e32 v1, 12, v1 -; GFX8-NEXT: v_max_u16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_max_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll index 07480a0ce0c2e..cc0f7e2ca5a54 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll @@ -983,7 +983,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_lo_u32 v6, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v7, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v8, v3, v5 ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v6, v3 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v7 @@ -1010,7 +1010,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 ; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v5 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v7, v3, v5 ; CHECK-NEXT: v_mul_lo_u32 v5, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v8, v4, v6 ; CHECK-NEXT: v_mul_hi_u32 v9, v3, v6 @@ -1058,7 +1058,7 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CHECK-NEXT: v_mul_lo_u32 v6, v3, v2 -; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v3, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_mul_lo_u32 v4, v4, v2 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 @@ -1265,10 +1265,10 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_mul_lo_u32 v11, v9, v4 -; GISEL-NEXT: v_mul_hi_u32 v9, v4, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; GISEL-NEXT: v_mul_lo_u32 v12, v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v5, v4 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GISEL-NEXT: 
v_mul_lo_u32 v7, v7, v4 @@ -1339,7 +1339,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5 ; CGP-NEXT: v_mul_lo_u32 v8, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v9, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v5, v7 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v5 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v9 @@ -1366,7 +1366,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc ; CGP-NEXT: v_mul_lo_u32 v8, v5, v7 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v9, v5, v7 ; CGP-NEXT: v_mul_lo_u32 v7, v6, v7 ; CGP-NEXT: v_mul_lo_u32 v10, v6, v8 ; CGP-NEXT: v_mul_hi_u32 v11, v5, v8 @@ -1433,10 +1433,10 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_mul_lo_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v7, v4, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v7, v4 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_mul_lo_u32 v11, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v5, v4 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 ; CGP-NEXT: v_mul_lo_u32 v8, v8, v4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll new file mode 100644 index 0000000000000..6b835bb4eef66 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-libcall-sincos-pass-ordering.ll @@ -0,0 +1,77 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -O1 -amdgpu-prelink %s | FileCheck %s + +; Make sure that sin+cos -> sincos simplification happens after +; initial IR simplifications, otherwise we can't identify the common +; argument value. 
+ +@.str = private unnamed_addr addrspace(4) constant [21 x i8] c"x: %f, y: %f, z: %f\0A\00", align 1 + +; Should have call to sincos declarations, not calls to the asm pseudo-libcalls +define protected amdgpu_kernel void @swdev456865(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, float noundef %x) #0 { +; CHECK-LABEL: define protected amdgpu_kernel void @swdev456865( +; CHECK-SAME: ptr addrspace(1) nocapture writeonly [[OUT0:%.*]], ptr addrspace(1) nocapture writeonly [[OUT1:%.*]], ptr addrspace(1) nocapture writeonly [[OUT2:%.*]], float noundef [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) +; CHECK-NEXT: [[I_I:%.*]] = call float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[I_I2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[I_I]], [[I_I2]] +; CHECK-NEXT: [[CONV:%.*]] = fpext float [[X]] to double +; CHECK-NEXT: [[CONV5:%.*]] = fpext float [[ADD]] to double +; CHECK-NEXT: store double [[CONV]], ptr addrspace(1) [[OUT0]], align 8 +; CHECK-NEXT: store double [[CONV5]], ptr addrspace(1) [[OUT1]], align 8 +; CHECK-NEXT: store double [[CONV5]], ptr addrspace(1) [[OUT2]], align 8 +; CHECK-NEXT: ret void +; +entry: + %x.addr = alloca float, align 4, addrspace(5) + %y = alloca float, align 4, addrspace(5) + %z = alloca float, align 4, addrspace(5) + store float %x, ptr addrspace(5) %x.addr, align 4 + call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %y) + %i = load float, ptr addrspace(5) %x.addr, align 4 + %call = call float @_Z3sinf(float noundef %i) #3 + %i1 = load float, ptr addrspace(5) %x.addr, align 4 + %call1 = call float @_Z3cosf(float noundef %i1) #3 + %add = fadd float %call, %call1 + store float %add, ptr addrspace(5) %y, align 4 + call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %z) + %i2 = load float, ptr addrspace(5) %x.addr, align 4 + %call2 = call float @_Z3cosf(float noundef %i2) #3 + %i3 = load float, ptr addrspace(5) %x.addr, align 4 + %call3 = call float @_Z3sinf(float noundef %i3) #3 + %add4 = fadd float %call2, %call3 + store float %add4, ptr addrspace(5) %z, align 4 + %i4 = load float, ptr addrspace(5) %x.addr, align 4 + %conv = fpext float %i4 to double + %i5 = load float, ptr addrspace(5) %y, align 4 + %conv5 = fpext float %i5 to double + %i6 = load float, ptr addrspace(5) %z, align 4 + %conv6 = fpext float %i6 to double + store double %conv, ptr addrspace(1) %out0, align 8 + store double %conv5, ptr addrspace(1) %out1, align 8 + store double %conv6, ptr addrspace(1) %out2, align 8 + call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %z) + call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %y) + ret void +} + +declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #1 +declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #1 + +define internal float @_Z3cosf(float noundef %arg) #2 { +bb: + %i = tail call float asm "pseudo-libcall-cos %0, %1", "=v,v"(float noundef %arg) #2 + ret float %i +} + +define internal float @_Z3sinf(float noundef %arg) #2 { +bb: + %i = tail call float asm "pseudo-libcall-sin %0, %1", "=v,v"(float noundef %arg) #2 + ret float %i +} + +attributes #0 = { norecurse nounwind } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { mustprogress nofree norecurse nounwind willreturn memory(none) } 
+attributes #3 = { nounwind willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll index 5c56276eeb0f1..9646d196da42f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll @@ -884,10 +884,9 @@ entry: define float @sincos_f32_unused_result_cos(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_cos -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) +; CHECK-NEXT: [[TMP0:%.*]] = tail call contract float @_Z3sinf(float [[X]]) ; CHECK-NEXT: ret float [[TMP0]] ; entry: @@ -900,11 +899,9 @@ entry: define float @sincos_f32_unused_result_sin(float %x) { ; CHECK-LABEL: define float @sincos_f32_unused_result_sin -; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR4]] { +; CHECK-SAME: (float [[X:%.*]]) local_unnamed_addr #[[ATTR5]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) -; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) -; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = tail call contract float @_Z3cosf(float [[X]]) ; CHECK-NEXT: ret float [[TMP1]] ; entry: @@ -917,13 +914,11 @@ entry: define void @sincos_f32_repeated_uses(float %x, ptr addrspace(1) %sin_out, ptr addrspace(1) %cos_out) { ; CHECK-LABEL: define void @sincos_f32_repeated_uses -; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) [[SIN_OUT:%.*]], ptr addrspace(1) [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[__SINCOS_3:%.*]] = alloca float, align 4, addrspace(5) ; CHECK-NEXT: [[TMP0:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_3]]) -; CHECK-NEXT: [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[__SINCOS_]]) -; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_3]], align 4 ; CHECK-NEXT: store volatile float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store volatile float [[TMP0]], ptr addrspace(1) [[SIN_OUT]], align 4 ; CHECK-NEXT: store volatile float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index bf4302c156d83..4c9c34de7194c 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -38342,12 +38342,11 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: 
v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 @@ -38366,7 +38365,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0 -; GFX10-NEXT: v_writelane_b32 v40, s35, 3 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 ; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 @@ -38377,10 +38376,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s27, 1, v13 ; GFX10-NEXT: v_cmp_eq_u32_e64 s28, 1, v11 ; GFX10-NEXT: v_cmp_eq_u32_e64 s29, 1, v7 -; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v3 -; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v1 -; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 -; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 +; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_hi, 1, v3 +; GFX10-NEXT: v_cmp_eq_u32_e64 s30, 1, v1 +; GFX10-NEXT: v_cmp_eq_u32_e64 s31, 1, v5 +; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v9 ; GFX10-NEXT: s_waitcnt vmcnt(32) ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(31) @@ -38460,10 +38459,10 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 ; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, vcc_hi +; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s31 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s34 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 ; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 ; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 @@ -38481,7 +38480,6 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 ; GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 ; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s35, v40, 3 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 37412ac3aa541..99755133f36d6 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 | FileCheck %s --check-prefixes=GFX940,ALL ; ALL-LABEL: {{^}}build_vector2: ; R600: MOV @@ -96,3 +97,99 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32 store <2 x i16> %ins.1, ptr addrspace(1) %out ret void } + +; R600-LABEL: 
build_v2i32_from_v4i16_shuffle: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 0, @10, KC0[], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_16 T1.X, T0.X, 48, #3 +; R600-NEXT: VTX_READ_16 T0.X, T0.X, 44, #3 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV * T0.X, 0.0, +; R600-NEXT: ALU clause starting at 11: +; R600-NEXT: LSHL * T0.Y, T1.X, literal.x, +; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; R600-NEXT: LSHL T0.X, T0.X, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45) +; +; GFX6-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_lshl_b32 s3, s3, 16 +; GFX6-NEXT: s_lshl_b32 s2, s2, 16 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: v_mov_b32_e32 v1, s3 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, s0 +; GFX8-NEXT: s_mov_b32 s5, s1 +; GFX8-NEXT: s_lshl_b32 s0, s3, 16 +; GFX8-NEXT: s_lshl_b32 s1, s2, 16 +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s0 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s2, s2, 16 +; GFX10-NEXT: s_lshl_b32 s3, s3, 16 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_lshl_b32 s2, s2, 16 +; GFX11-NEXT: s_lshl_b32 s3, s3, 16 +; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +; +; GFX940-LABEL: build_v2i32_from_v4i16_shuffle: +; GFX940: ; %bb.0: ; %entry +; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: s_waitcnt lgkmcnt(0) +; GFX940-NEXT: s_lshl_b32 s3, s3, 16 +; GFX940-NEXT: s_lshl_b32 s2, s2, 16 +; GFX940-NEXT: v_mov_b32_e32 v0, s2 +; GFX940-NEXT: v_mov_b32_e32 v1, s3 +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 +; GFX940-NEXT: s_endpgm +define amdgpu_kernel void @build_v2i32_from_v4i16_shuffle(ptr addrspace(1) %out, <4 x i16> %in) { +entry: + %shuf = shufflevector <4 x i16> %in, <4 x i16> zeroinitializer, <2 x i32> <i32 0, i32 2> + %zextended = zext <2 x i16> %shuf to <2 x i32> + %shifted = shl <2 x i32> %zextended, <i32 16, i32 16> + store <2 x i32> %shifted, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll 
b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll index 0c62b52eb92af..587340c7aa342 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll @@ -4,12 +4,14 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=GFX10 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1030 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=GFX1100 +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=G_SI ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX7 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX10 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1030 ; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefix=G_GFX1100 +; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12 declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg) declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg) @@ -70,6 +72,18 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_min_noret_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_noret_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -170,6 +184,15 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: buffer_atomic_fmin v0, v1, s[0:3], 0 offen glc @@ -292,6 +315,20 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f32_off4_slc(<4 x i32> inre ; GFX1100-NEXT: ds_store_b32 v1, v0 ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b96 s[4:6], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], s4 offen 
th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: v_mov_b32_e32 v1, s6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: ds_store_b32 v1, v0 +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_min_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd @@ -427,6 +464,18 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f32(<4 x i32> inreg %rsrc ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_max_noret_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_noret_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd @@ -527,6 +576,15 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f32(<4 x i32> inreg %rsrc, floa ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v[0:1], v0, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: buffer_atomic_fmax v0, v1, s[0:3], 0 offen glc @@ -641,6 +699,20 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f32_off4_slc(<4 x i32> inre ; GFX1100-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1100-NEXT: s_endpgm ; +; GFX12-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_mov_b32 s4, 4 +; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_NT_RETURN +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[6:7] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +; ; G_SI-LABEL: raw_buffer_atomic_max_rtn_f32_off4_slc: ; G_SI: ; %bb.0: ; %main_body ; G_SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index ec3c08ec79523..da64c379672ef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -1259,17 +1259,17 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX10SELDAG-LABEL: isnan_v4f16: ; GFX10SELDAG: ; %bb.0: ; GFX10SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10SELDAG-NEXT: v_mov_b32_e32 v2, 3 -; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v0, 3 -; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v2 src0_sel:WORD_1 src1_sel:DWORD -; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s5, v0, v2 src0_sel:WORD_1 src1_sel:DWORD +; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s4, v0, 3 +; GFX10SELDAG-NEXT: v_mov_b32_e32 v3, 3 +; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s4 +; GFX10SELDAG-NEXT: 
v_cmp_class_f16_e64 s4, v1, 3 +; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v0, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v5 +; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10SELDAG-NEXT: v_cmp_class_f16_sdwa s4, v1, v3 src0_sel:WORD_1 src1_sel:DWORD +; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v4 ; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10SELDAG-NEXT: v_mov_b32_e32 v0, v4 -; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s5 -; GFX10SELDAG-NEXT: v_cmp_class_f16_e64 s5, v1, 3 -; GFX10SELDAG-NEXT: v_mov_b32_e32 v1, v5 -; GFX10SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 ; GFX10SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10GLISEL-LABEL: isnan_v4f16: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index ab6a9dcf71ace..a87fa8bf36d9e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -7404,35 +7404,35 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) % ; GFX12-NEXT: v_dual_mov_b32 v4, s22 :: v_dual_mov_b32 v9, s31 ; GFX12-NEXT: v_dual_mov_b32 v8, s30 :: v_dual_mov_b32 v11, s35 ; GFX12-NEXT: v_dual_mov_b32 v10, s34 :: v_dual_mov_b32 v3, s5 -; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 -; GFX12-NEXT: s_lshr_b32 s12, s0, 16 -; GFX12-NEXT: s_mov_b32 s14, s1 -; GFX12-NEXT: s_lshr_b32 s16, s1, 16 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[16:17], s[2:3], 0x100000 ; GFX12-NEXT: s_lshr_b32 s2, s2, 16 ; GFX12-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v5, s23 ; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v13, s25 +; GFX12-NEXT: s_mov_b32 s12, s1 +; GFX12-NEXT: s_lshr_b32 s14, s1, 16 ; GFX12-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v15, s27 ; GFX12-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v7, s7 +; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x100000 +; GFX12-NEXT: s_lshr_b32 s0, s0, 16 ; GFX12-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v17, s19 +; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x100000 -; GFX12-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GFX12-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v19, s21 ; GFX12-NEXT: v_mov_b32_e32 v18, s20 -; GFX12-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x100000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000 ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: global_store_b128 v24, v[8:11], s[8:9] offset:80 ; GFX12-NEXT: global_store_b128 v24, v[0:3], s[8:9] offset:64 -; GFX12-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0 +; GFX12-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v0, s16 ; GFX12-NEXT: v_dual_mov_b32 v3, s3 :: v_dual_mov_b32 v2, s2 -; GFX12-NEXT: v_dual_mov_b32 v9, s15 :: v_dual_mov_b32 v8, s14 -; GFX12-NEXT: v_dual_mov_b32 v11, s17 :: v_dual_mov_b32 v10, s16 +; GFX12-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v8, s12 +; GFX12-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v10, s14 ; GFX12-NEXT: v_dual_mov_b32 v21, s11 :: v_dual_mov_b32 v20, s10 -; GFX12-NEXT: v_dual_mov_b32 v23, s13 :: v_dual_mov_b32 v22, s12 +; GFX12-NEXT: v_dual_mov_b32 v23, s1 :: v_dual_mov_b32 v22, s0 ; GFX12-NEXT: s_clause 0x5 ; GFX12-NEXT: global_store_b128 v24, v[12:15], s[8:9] offset:112 ; GFX12-NEXT: 
global_store_b128 v24, v[4:7], s[8:9] offset:96 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 952827b8cd0e7..889755c23bbc7 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -8808,73 +8808,73 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_lshrrev_b16 v2, 8, s6 ; GFX12-NEXT: v_lshrrev_b16 v4, 8, s5 ; GFX12-NEXT: v_lshrrev_b16 v8, 8, s2 -; GFX12-NEXT: s_lshr_b32 s24, s7, 16 +; GFX12-NEXT: s_lshr_b32 s22, s7, 16 ; GFX12-NEXT: v_bfe_i32 v31, v1, 0, 8 -; GFX12-NEXT: s_lshr_b32 s42, s2, 24 -; GFX12-NEXT: s_mov_b32 s48, s7 +; GFX12-NEXT: s_lshr_b32 s40, s2, 24 +; GFX12-NEXT: s_mov_b32 s46, s7 ; GFX12-NEXT: v_lshrrev_b16 v5, 8, s4 ; GFX12-NEXT: v_lshrrev_b16 v7, 8, s1 -; GFX12-NEXT: s_lshr_b32 s26, s6, 16 -; GFX12-NEXT: s_lshr_b32 s44, s1, 16 +; GFX12-NEXT: s_lshr_b32 s24, s6, 16 +; GFX12-NEXT: s_lshr_b32 s42, s1, 16 ; GFX12-NEXT: s_ashr_i64 s[58:59], s[6:7], 56 -; GFX12-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000 ; GFX12-NEXT: v_lshrrev_b16 v6, 8, s3 ; GFX12-NEXT: v_lshrrev_b16 v3, 8, s0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s24 -; GFX12-NEXT: s_lshr_b32 s28, s6, 24 -; GFX12-NEXT: s_lshr_b32 s30, s5, 16 -; GFX12-NEXT: s_lshr_b32 s40, s2, 16 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v33, s22 +; GFX12-NEXT: s_lshr_b32 s26, s6, 24 +; GFX12-NEXT: s_lshr_b32 s28, s5, 16 +; GFX12-NEXT: s_lshr_b32 s38, s2, 16 ; GFX12-NEXT: v_bfe_i32 v11, v8, 0, 8 ; GFX12-NEXT: v_bfe_i32 v23, v4, 0, 8 ; GFX12-NEXT: v_bfe_i32 v27, v2, 0, 8 ; GFX12-NEXT: v_ashrrev_i32_e32 v32, 31, v31 -; GFX12-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v34, s25 :: v_dual_mov_b32 v35, s58 -; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s26 -; GFX12-NEXT: v_dual_mov_b32 v56, s43 :: v_dual_mov_b32 v29, s48 -; GFX12-NEXT: v_mov_b32_e32 v30, s49 -; GFX12-NEXT: s_lshr_b32 s46, s0, 24 -; GFX12-NEXT: s_mov_b32 s50, s5 -; GFX12-NEXT: s_mov_b32 s52, s3 -; GFX12-NEXT: s_lshr_b32 s34, s4, 16 -; GFX12-NEXT: s_lshr_b32 s36, s4, 24 -; GFX12-NEXT: s_ashr_i64 s[22:23], s[2:3], 56 +; GFX12-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v34, s23 :: v_dual_mov_b32 v35, s58 +; GFX12-NEXT: v_dual_mov_b32 v36, s59 :: v_dual_mov_b32 v37, s24 +; GFX12-NEXT: v_dual_mov_b32 v56, s41 :: v_dual_mov_b32 v29, s46 +; GFX12-NEXT: v_mov_b32_e32 v30, s47 +; GFX12-NEXT: s_lshr_b32 s44, s0, 24 +; GFX12-NEXT: s_mov_b32 s48, s5 +; GFX12-NEXT: s_mov_b32 s50, s3 +; GFX12-NEXT: s_lshr_b32 s30, s4, 16 +; GFX12-NEXT: s_lshr_b32 s34, s4, 24 +; GFX12-NEXT: s_ashr_i64 s[54:55], s[2:3], 56 ; GFX12-NEXT: s_ashr_i64 s[56:57], s[4:5], 56 ; GFX12-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX12-NEXT: v_bfe_i32 v19, v5, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000 -; GFX12-NEXT: s_lshr_b32 s38, s3, 16 -; GFX12-NEXT: s_mov_b32 s54, s1 +; GFX12-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000 +; 
GFX12-NEXT: s_lshr_b32 s36, s3, 16 +; GFX12-NEXT: s_mov_b32 s52, s1 ; GFX12-NEXT: s_bfe_i64 s[12:13], s[2:3], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[14:15], s[4:5], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[2:3], s[52:53], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[4:5], s[50:51], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[6:7], s[46:47], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[2:3], s[50:51], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[4:5], s[48:49], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[6:7], s[44:45], 0x80000 ; GFX12-NEXT: s_lshr_b32 s20, s0, 16 ; GFX12-NEXT: s_ashr_i64 s[18:19], s[0:1], 56 ; GFX12-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX12-NEXT: v_bfe_i32 v15, v6, 0, 8 -; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 ; GFX12-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v38, s27 :: v_dual_mov_b32 v39, s28 -; GFX12-NEXT: v_dual_mov_b32 v40, s29 :: v_dual_mov_b32 v41, s30 -; GFX12-NEXT: v_dual_mov_b32 v42, s31 :: v_dual_mov_b32 v43, s56 -; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s34 -; GFX12-NEXT: v_dual_mov_b32 v52, s23 :: v_dual_mov_b32 v53, s40 -; GFX12-NEXT: v_dual_mov_b32 v54, s41 :: v_dual_mov_b32 v55, s42 +; GFX12-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v38, s25 :: v_dual_mov_b32 v39, s26 +; GFX12-NEXT: v_dual_mov_b32 v40, s27 :: v_dual_mov_b32 v41, s28 +; GFX12-NEXT: v_dual_mov_b32 v42, s29 :: v_dual_mov_b32 v43, s56 +; GFX12-NEXT: v_dual_mov_b32 v44, s57 :: v_dual_mov_b32 v45, s30 +; GFX12-NEXT: v_dual_mov_b32 v52, s55 :: v_dual_mov_b32 v53, s38 +; GFX12-NEXT: v_dual_mov_b32 v54, s39 :: v_dual_mov_b32 v55, s40 ; GFX12-NEXT: s_bfe_i64 s[10:11], s[0:1], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[0:1], s[54:55], 0x80000 +; GFX12-NEXT: s_bfe_i64 s[0:1], s[52:53], 0x80000 ; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v11 ; GFX12-NEXT: v_ashrrev_i32_e32 v24, 31, v23 ; GFX12-NEXT: v_ashrrev_i32_e32 v28, 31, v27 ; GFX12-NEXT: global_store_b128 v0, v[33:36], s[8:9] offset:240 -; GFX12-NEXT: v_mov_b32_e32 v33, s44 +; GFX12-NEXT: v_mov_b32_e32 v33, s42 ; GFX12-NEXT: global_store_b128 v0, v[29:32], s[8:9] offset:224 ; GFX12-NEXT: v_dual_mov_b32 v25, s16 :: v_dual_mov_b32 v26, s17 ; GFX12-NEXT: v_dual_mov_b32 v32, s7 :: v_dual_mov_b32 v21, s4 @@ -8882,16 +8882,16 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o ; GFX12-NEXT: v_dual_mov_b32 v14, s3 :: v_dual_mov_b32 v9, s12 ; GFX12-NEXT: v_dual_mov_b32 v10, s13 :: v_dual_mov_b32 v5, s0 ; GFX12-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000 -; GFX12-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000 -; GFX12-NEXT: v_dual_mov_b32 v46, s35 :: v_dual_mov_b32 v47, s36 -; GFX12-NEXT: v_dual_mov_b32 v48, s37 :: v_dual_mov_b32 v49, s38 -; GFX12-NEXT: v_dual_mov_b32 v34, s45 :: v_dual_mov_b32 v35, s18 +; GFX12-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000 +; GFX12-NEXT: v_dual_mov_b32 v46, s31 :: v_dual_mov_b32 v47, s34 +; GFX12-NEXT: v_dual_mov_b32 v48, s35 :: v_dual_mov_b32 v49, s36 +; GFX12-NEXT: v_dual_mov_b32 v34, s43 :: v_dual_mov_b32 v35, s18 ; GFX12-NEXT: v_dual_mov_b32 v36, s19 :: v_dual_mov_b32 v29, s20 ; GFX12-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GFX12-NEXT: v_ashrrev_i32_e32 v20, 31, v19 ; GFX12-NEXT: v_dual_mov_b32 v18, s15 :: v_dual_mov_b32 v13, s2 ; GFX12-NEXT: v_dual_mov_b32 v6, s1 :: v_dual_mov_b32 v1, s10 -; GFX12-NEXT: v_dual_mov_b32 v50, s39 :: v_dual_mov_b32 v51, s22 +; GFX12-NEXT: v_dual_mov_b32 v50, s37 :: v_dual_mov_b32 v51, s54 ; GFX12-NEXT: v_dual_mov_b32 v30, s21 :: v_dual_mov_b32 v31, s6 ; GFX12-NEXT: v_ashrrev_i32_e32 v4, 31, v3 
; GFX12-NEXT: v_ashrrev_i32_e32 v16, 31, v15 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll new file mode 100644 index 0000000000000..358f42dfe8dd5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -0,0 +1,2696 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s + +%struct.S = type { [32 x i32] } + +@shared = addrspace(3) global %struct.S undef, align 4 + +define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 { +; CHECK-LABEL: memcpy_p0_p0_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: 
flat_store_byte v[2:3], v4 offset:17 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41 +; CHECK-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45 +; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 { +; CHECK-LABEL: memcpy_p1_p1_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 { +; CHECK-LABEL: memcpy_p1_p4_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 
v4, v[0:3], s[0:1] offset:112 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 { +; CHECK-LABEL: memcpy_p5_p4_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] 
offset:29 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32 +; CHECK-NEXT: global_load_ubyte 
v15, v0, s[0:1] offset:51 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54 +; CHECK-NEXT: 
global_load_ubyte v18, v0, s[0:1] offset:73 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, 
s[8:11], 0 offen offset:76 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111 +; CHECK-NEXT: global_load_ubyte v6, v0, 
s[0:1] offset:112 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125 
+; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 { +; CHECK-LABEL: memcpy_p0_p5_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: flat_store_byte v[0:1], v3 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 +; CHECK-NEXT: 
buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51 +; CHECK-NEXT: s_nop 0 
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58 +; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100 +; CHECK-NEXT: s_nop 0 +; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126 +; 
CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126 +; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 { +; CHECK-LABEL: memcpy_p3_p4_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { +; CHECK-LABEL: memcpy_p0_p3_minsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:127 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:126 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:125 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:124 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 
v1, s1 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:123 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:122 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:121 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:119 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:118 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:117 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:116 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:115 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:114 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:113 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:112 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:111 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:110 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:109 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:108 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:107 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:104 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:103 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:102 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:101 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:100 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:98 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:97 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:96 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:95 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:94 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:93 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:92 +; CHECK-NEXT: ds_read_u8 
v5, v2 offset:91 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:90 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:89 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:88 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:87 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:86 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:85 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:84 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:83 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:82 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:81 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:79 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:78 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:77 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:76 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:75 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:74 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:73 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:72 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:71 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:70 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:69 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:68 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:67 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:64 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:63 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:62 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:61 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:60 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:58 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; 
CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:57 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:56 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:55 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:54 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:53 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:52 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:51 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:50 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:49 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:47 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:46 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:45 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:44 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:43 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:42 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:41 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:39 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:38 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:37 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:36 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:35 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:34 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:33 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:32 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:31 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:29 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:27 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: 
flat_store_byte v[0:1], v3 offset:27 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:23 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:22 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:21 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:20 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:18 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:16 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:17 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:8 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:9 +; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 +; CHECK-NEXT: ds_read_u8 v8, v2 offset:11 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:12 +; CHECK-NEXT: ds_read_u8 v10, v2 offset:13 +; CHECK-NEXT: ds_read_u8 v11, v2 offset:14 +; CHECK-NEXT: ds_read_u8 v12, v2 offset:15 +; CHECK-NEXT: ds_read_u8 v13, v2 +; CHECK-NEXT: ds_read_u8 v14, v2 offset:1 +; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 +; CHECK-NEXT: ds_read_u8 v16, v2 offset:3 +; CHECK-NEXT: ds_read_u8 v17, v2 offset:4 +; CHECK-NEXT: ds_read_u8 v18, v2 offset:5 +; CHECK-NEXT: ds_read_u8 v19, v2 offset:6 +; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v13 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { +; CHECK-LABEL: memcpy_p0_p0_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s2 +; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3 
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] 
offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44 +; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45 +; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 { +; CHECK-LABEL: memcpy_p1_p1_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx2 
v[0:1], v4, s[2:3] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 { +; CHECK-LABEL: memcpy_p1_p4_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 { +; CHECK-LABEL: memcpy_p5_p4_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] 
offset:13 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(18) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19 +; 
CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen 
offset:41 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, 
s[8:11], 0 offen offset:63 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; 
CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89 +; CHECK-NEXT: s_waitcnt vmcnt(35) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(34) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91 +; CHECK-NEXT: s_waitcnt vmcnt(33) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92 +; CHECK-NEXT: s_waitcnt vmcnt(32) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93 +; CHECK-NEXT: s_waitcnt vmcnt(31) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94 +; CHECK-NEXT: s_waitcnt vmcnt(30) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95 +; CHECK-NEXT: s_waitcnt vmcnt(29) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96 +; CHECK-NEXT: s_waitcnt vmcnt(28) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97 +; CHECK-NEXT: s_waitcnt vmcnt(27) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100 +; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108 +; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109 +; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 +; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111 +; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112 +; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113 +; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114 +; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115 +; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116 +; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117 +; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118 +; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101 +; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102 +; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103 +; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104 +; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105 +; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106 +; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] 
offset:125 +; CHECK-NEXT: s_waitcnt vmcnt(36) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107 +; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118 +; CHECK-NEXT: s_waitcnt vmcnt(26) +; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119 +; CHECK-NEXT: s_waitcnt vmcnt(25) +; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120 +; CHECK-NEXT: s_waitcnt vmcnt(24) +; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121 +; CHECK-NEXT: s_waitcnt vmcnt(23) +; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122 +; CHECK-NEXT: s_waitcnt vmcnt(22) +; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126 +; CHECK-NEXT: s_waitcnt vmcnt(19) +; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 { +; CHECK-LABEL: memcpy_p0_p5_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3] +; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1] +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_add_u32 s8, s8, s7 +; CHECK-NEXT: s_addc_u32 s9, s9, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8 +; CHECK-NEXT: 
buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13 +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: s_waitcnt vmcnt(17) +; CHECK-NEXT: flat_store_byte v[0:1], v3 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: 
flat_store_byte v[0:1], v4 offset:19 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 
0 offen offset:61 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte 
v[0:1], v17 offset:68 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94 +; CHECK-NEXT: 
flat_store_byte v[0:1], v8 offset:95 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101 +; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109 +; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110 +; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111 +; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112 +; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113 +; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114 +; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116 +; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117 +; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118 +; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102 +; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103 +; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104 +; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105 +; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106 +; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107 +; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108 +; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117 +; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124 +; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126 +; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 { +; CHECK-LABEL: memcpy_p3_p4_optsize: +; 
CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt vmcnt(2) +; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5 +; CHECK-NEXT: s_waitcnt vmcnt(1) +; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false) + ret void +} + +define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { +; CHECK-LABEL: memcpy_p0_p3_optsize: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:127 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:126 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:125 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:124 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, s0 +; CHECK-NEXT: v_mov_b32_e32 v1, s1 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:123 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:122 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:121 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:119 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:118 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:117 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:116 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:115 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:114 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:113 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:112 +; CHECK-NEXT: 
ds_read_u8 v5, v2 offset:111 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:110 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:109 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:108 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:107 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:104 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:103 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:102 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:101 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:100 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:98 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:97 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:96 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:95 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:94 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:93 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:92 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:91 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:90 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:89 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:88 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:87 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:86 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:85 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:84 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:83 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:82 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:81 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:79 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:78 +; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:77 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:76 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:75 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:74 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:73 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:72 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:71 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:70 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:69 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:68 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:67 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:64 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:63 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:62 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:61 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:60 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:58 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:57 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:56 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:55 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:54 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:53 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:52 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:51 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:50 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:49 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:47 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:46 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:45 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:44 +; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:43 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:42 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:41 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:39 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:38 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:37 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:36 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:35 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:34 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:33 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:32 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:31 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:29 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:27 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:23 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:22 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:21 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:20 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:18 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19 +; CHECK-NEXT: ds_read_u8 v3, v2 offset:16 +; CHECK-NEXT: ds_read_u8 v5, v2 offset:17 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18 +; CHECK-NEXT: ds_read_u8 v4, v2 offset:8 +; CHECK-NEXT: ds_read_u8 v6, v2 offset:9 +; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 +; CHECK-NEXT: ds_read_u8 v8, v2 offset:11 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:12 +; CHECK-NEXT: ds_read_u8 v10, v2 offset:13 +; CHECK-NEXT: ds_read_u8 v11, v2 offset:14 +; CHECK-NEXT: ds_read_u8 v12, v2 offset:15 +; CHECK-NEXT: ds_read_u8 v13, v2 +; CHECK-NEXT: ds_read_u8 v14, v2 offset:1 +; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 +; CHECK-NEXT: 
ds_read_u8 v16, v2 offset:3 +; CHECK-NEXT: ds_read_u8 v17, v2 offset:4 +; CHECK-NEXT: ds_read_u8 v18, v2 offset:5 +; CHECK-NEXT: ds_read_u8 v19, v2 offset:6 +; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17 +; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 +; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14 +; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12 +; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 +; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 +; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9 +; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 +; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5 +; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v13 +; CHECK-NEXT: s_endpgm +entry: + tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2 + +declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2 + +attributes #0 = { minsize } +attributes #1 = { optsize } +attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index f8e7cb397b475..8a5f75332557e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,18 +28,17 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: 
liveins: $vcc_hi, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr0 = COPY $sgpr33 + ; GCN-NEXT: $vcc_hi = frame-setup COPY $sgpr33 ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5 + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.73, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3 @@ -116,18 +115,18 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: KILL implicit-def $vcc_lo, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr22 = 
SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; GCN-NEXT: liveins: $vcc_hi, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3 ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2 @@ -198,16 +197,15 @@ body: | ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 ; GCN-NEXT: KILL killed renamable $vgpr2 - ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4 - ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5) - ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) + ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc - ; GCN-NEXT: $sgpr33 = COPY $sgpr0 + ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll index 731a88278e512..204c8140d3f17 100644 --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -278,7 +278,7 @@ entry: ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half ; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01) -; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp) +; GCN-PRELINK: %__pow2sqrt = tail call fast float @llvm.sqrt.f32(float %tmp) define amdgpu_kernel void @test_pow_half(ptr addrspace(1) nocapture %a) { entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 @@ -476,7 +476,7 @@ declare float @_Z5rootnfi(float, i32) ; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2 ; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2) -; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp) +; GCN-PRELINK: %__rootn2sqrt = tail call fast float @llvm.sqrt.f32(float %tmp) define amdgpu_kernel void @test_rootn_2(ptr addrspace(1) nocapture %a) { entry: %tmp = load float, ptr addrspace(1) %a, align 4 @@ -838,5 +838,5 @@ entry: ; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] ; GCN-PRELINK-DAG: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nofree nounwind memory(read) } +; GCN-PRELINK-DAG: attributes #[[$NOUNWIND_READONLY]] = { nounwind memory(read) } attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir 
b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir index 63bef40c34742..b8ac50c3aeb5e 100644 --- a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -160,7 +160,7 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $sgpr20 = IMPLICIT_DEF ; PAIR-GFX11-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx11 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo @@ -174,7 +174,7 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $sgpr20 = IMPLICIT_DEF ; PAIR-GFX12-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32_gfx12 $sgpr20, killed $vgpr1, killed $vgpr2, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit killed $vcc_lo @@ -458,9 +458,9 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, 
implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx11 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo ; PAIR-GFX11-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -476,9 +476,9 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32_gfx12 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit killed $vcc_lo ; PAIR-GFX12-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec @@ -559,12 +559,12 @@ body: | ; PAIR-GFX11-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx11 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit 
$vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX11-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx11 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx11 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX11-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx11 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx11 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX11-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo @@ -586,12 +586,12 @@ body: | ; PAIR-GFX12-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32_gfx12 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc_lo + ; PAIR-GFX12-NEXT: $vgpr19, 
$vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32_gfx12 $vgpr0, $vgpr3, 10, $vgpr1, killed $vgpr20, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo - ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec - ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32_gfx12 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec + ; PAIR-GFX12-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32_gfx12 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $vcc_lo, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $vcc_lo, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32_gfx12 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec ; PAIR-GFX12-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc_lo diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir index eaa627966347f..40ea01189f2cd 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables-x.mir @@ -1,5 +1,6 @@ # REQUIRES: x86-registered-target # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-check-debugify -o - %s 2>&1 | FileCheck %s +# RUN: llc --experimental-debuginfo-iterators=false -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-check-debugify -o - %s 2>&1 | FileCheck %s --- | ; ModuleID = 'check-line-and-variables.mir' source_filename = "check-line-and-variables.c" diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll index 9033fd2f147c4..56c7cf45705a7 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll +++ b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.ll @@ -1,4 +1,5 @@ ; RUN: llc -debugify-check-and-strip-all-safe -o - %s 2>&1 | FileCheck %s +; RUN: llc --experimental-debuginfo-iterators=false -debugify-check-and-strip-all-safe -o - %s 2>&1 | FileCheck %s ; ModuleID = 'main.c' source_filename = 
"main.c" diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir index 9eb722258b703..0805a7f4cfc6c 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/check-line-and-variables.mir @@ -1,6 +1,8 @@ # REQUIRES: x86-registered-target # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,dead-mi-elimination,mir-check-debugify -o - %s 2>&1 | FileCheck %s # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-PASS +# RUN: llc --experimental-debuginfo-iterators=false -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,dead-mi-elimination,mir-check-debugify -o - %s 2>&1 | FileCheck %s +# RUN: llc --experimental-debuginfo-iterators=false -mtriple=x86_64-unknown-linux-gnu -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s --check-prefix=CHECK-PASS --- | ; ModuleID = 'check-line-and-variables.mir' source_filename = "check-line-and-variables.ll" diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir b/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir index 59dcff9efd4d5..3035fb8eab3f8 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/locations-and-values.mir @@ -2,6 +2,10 @@ # RUN: llc -run-pass=mir-debugify -debugify-level=locations -o - %s | FileCheck --check-prefixes=ALL --implicit-check-not=dbg.value %s # RUN: llc -run-pass=mir-debugify,mir-strip-debug,mir-debugify -o - %s | FileCheck --check-prefixes=ALL,VALUE %s # RUN: llc -run-pass=mir-debugify,mir-strip-debug -o - %s | FileCheck --check-prefix=STRIP %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify -o - %s | FileCheck --check-prefixes=ALL,VALUE %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify -debugify-level=locations -o - %s | FileCheck --check-prefixes=ALL --implicit-check-not=dbg.value %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify,mir-strip-debug,mir-debugify -o - %s | FileCheck --check-prefixes=ALL,VALUE %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify,mir-strip-debug -o - %s | FileCheck --check-prefix=STRIP %s --- | ; ModuleID = 'loc-only.ll' diff --git a/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir b/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir index fe4fcc1a15bb8..8079db926e1b0 100644 --- a/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir +++ b/llvm/test/CodeGen/Generic/MIRDebugify/multifunction-module.mir @@ -1,6 +1,5 @@ -# FIXME: Remove rm after a few weeks. 
-# RUN: rm -f %S/multifunction-module.s # RUN: llc -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s +# RUN: llc --experimental-debuginfo-iterators=false -run-pass=mir-debugify,mir-check-debugify -o - %s 2>&1 | FileCheck %s # CHECK: Machine IR debug info check: PASS # CHECK-NOT: Assertion `Var <= NumVars && "Unexpected name for DILocalVariable"' diff --git a/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll new file mode 100644 index 0000000000000..bc89ddea6b85a --- /dev/null +++ b/llvm/test/CodeGen/Generic/expand-vp-fp-intrinsics.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -expandvp -S < %s | FileCheck %s + +define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fadd_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[RES1:%.*]] = fadd <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fsub_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fsub <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fmul_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fmul <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fdiv_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fdiv <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_frem_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) 
#[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = frem <4 x float> [[A0]], [[A1]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fabs_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> [[A0]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32) + +define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_sqrt_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[A0]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32) + +define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind { +; CHECK-LABEL: define void @vp_fneg_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i32 [[VP:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = fneg <4 x float> [[A0]] +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %vp) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32) + +define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { +; CHECK-LABEL: define void @vp_fma_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i4 [[A5:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A1]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4) + store <4 x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind { +; CHECK-LABEL: define void @vp_fmuladd_v4f32( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], ptr [[OUT:%.*]], i4 [[A5:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A1]]) +; CHECK-NEXT: store <4 x float> [[RES1]], ptr [[OUT]], align 16 +; CHECK-NEXT: ret void +; + %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4) + store <4
x float> %res, ptr %out + ret void +} +declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32) + +declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) +define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <4 x float> @vfmax_vv_v4f32( +; CHECK-SAME: <4 x float> [[VA:%.*]], <4 x float> [[VB:%.*]], <4 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VA]], <4 x float> [[VB]]) +; CHECK-NEXT: ret <4 x float> [[V1]] +; + %v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) +define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <8 x float> @vfmax_vv_v8f32( +; CHECK-SAME: <8 x float> [[VA:%.*]], <8 x float> [[VB:%.*]], <8 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <8 x float> @llvm.maxnum.v8f32(<8 x float> [[VA]], <8 x float> [[VB]]) +; CHECK-NEXT: ret <8 x float> [[V1]] +; + %v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} + +declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) +define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <4 x float> @vfmin_vv_v4f32( +; CHECK-SAME: <4 x float> [[VA:%.*]], <4 x float> [[VB:%.*]], <4 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VA]], <4 x float> [[VB]]) +; CHECK-NEXT: ret <4 x float> [[V1]] +; + %v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) +define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: define <8 x float> @vfmin_vv_v8f32( +; CHECK-SAME: <8 x float> [[VA:%.*]], <8 x float> [[VB:%.*]], <8 x i1> [[M:%.*]], i32 zeroext [[EVL:%.*]]) { +; CHECK-NEXT: [[V1:%.*]] = call <8 x float> @llvm.minnum.v8f32(<8 x float> [[VA]], <8 x float> [[VB]]) +; CHECK-NEXT: ret <8 x float> [[V1]] +; + %v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index 09297fb819ce5..ce81957f2a393 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -17,7 +17,6 @@ ; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16; ; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]]; ; CHECK-32-NEXT: { // callseq 0, 0 -; CHECK-32-NEXT: .reg .b32 temp_param_reg; ; CHECK-32-NEXT: .param .b32 param0; ; CHECK-32-NEXT: st.param.b32 [param0+0], %r[[ALLOCA]]; @@ -27,7 +26,6 @@ ; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16; ; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]]; ; CHECK-64-NEXT: { // callseq 0, 0 -; CHECK-64-NEXT: .reg .b32 temp_param_reg; ; CHECK-64-NEXT: .param .b64 param0; ; CHECK-64-NEXT: st.param.b64 [param0+0], %rd[[ALLOCA]]; diff --git 
a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 6895699a1dfea..96a4359d0ec43 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -827,7 +827,6 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0+0], %r1; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; @@ -856,7 +855,6 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; ; CHECK-NEXT: { // callseq 1, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0+0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; @@ -885,7 +883,6 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; ; CHECK-NEXT: { // callseq 2, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0+0], %r2; ; CHECK-NEXT: .param .align 4 .b8 param1[4]; diff --git a/llvm/test/CodeGen/PowerPC/toc-data.ll b/llvm/test/CodeGen/PowerPC/toc-data.ll index cbf3be9fcaad0..7f7afe76cfcde 100644 --- a/llvm/test/CodeGen/PowerPC/toc-data.ll +++ b/llvm/test/CodeGen/PowerPC/toc-data.ll @@ -12,6 +12,14 @@ ; RUN: llc -mtriple powerpc-ibm-aix-xcoff -verify-machineinstrs -O0 < %s | FileCheck %s --check-prefix TEST32 ; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -verify-machineinstrs -O0 < %s | FileCheck %s --check-prefix TEST64 +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s \ +; RUN: -stop-before=ppc-vsx-copy | FileCheck %s --check-prefix CHECK32LARGE +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -code-model=large -verify-machineinstrs < %s | FileCheck %s --check-prefix TEST32LARGE + +; Global variables i and f have the toc-data attribute. +; In the following functions, those writing to or reading from +; variables i and f should use the toc-data access pattern. +; All remaining variables should use the regular toc access sequence. 
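+; As a sketch only (register numbers and TOC labels are copied from the
+; checks below for illustration, they are not fixed by the ABI), the
+; large-code-model toc-data pattern computes the variable's address
+; directly from the TOC pointer in r2:
+;   addis 4, i[TD]@u(2)
+;   la    4, i[TD]@l(4)
+; whereas the regular sequence first loads the variable's address out of
+; its TOC entry before using it:
+;   addis 3, L..C0@u(2)
+;   lwz   4, L..C0@l(3)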
@i = dso_local global i32 0, align 4 #0 @d = dso_local local_unnamed_addr global double 3.141590e+00, align 8 @f = dso_local local_unnamed_addr global float 0x4005BE76C0000000, align 4 #0 @@ -44,6 +52,16 @@ define dso_local void @write_int(i32 signext %in) { ; TEST64: la 4, i[TD](2) ; TEST64-NEXT: stw 3, 0(4) +; CHECK32LARGE: name: write_int +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @i +; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc_and_gprc_nor0 = ADDItocL killed %[[SCRATCH1]], @i +; CHECK32LARGE-NEXT: STW %{{[0-9]+}}, 0, killed %[[SCRATCH2]] :: (store (s32) into @i) + +; FIXME: peephole optimization opportunity for lower part relocation @l to the consuming stw +; TEST32LARGE: .write_int: +; TEST32LARGE: addis 4, i[TD]@u(2) +; TEST32LARGE-NEXT: la 4, i[TD]@l(4) +; TEST32LARGE-NEXT: stw 3, 0(4) define dso_local i64 @read_ll() { entry: @@ -70,6 +88,15 @@ define dso_local i64 @read_ll() { ; TEST64: ld 3, L..C0(2) ; TEST64-NEXT: ld 3, 0(3) +; CHECK32LARGE: name: read_ll +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @ll +; CHECK32LARGE: LWZtocL @ll, killed %[[SCRATCH1]] :: (load (s32) from got) + +; TEST32LARGE: .read_ll: +; TEST32LARGE: addis 3, L..C0@u(2) +; TEST32LARGE-NEXT: lwz 4, L..C0@l(3) +; TEST32LARGE-NEXT: lwz 3, 0(4) +; TEST32LARGE-NEXT: lwz 4, 4(4) define dso_local float @read_float() { entry: @@ -96,6 +123,16 @@ define dso_local float @read_float() { ; TEST64: la 3, f[TD](2) ; TEST64-NEXT: lfs 1, 0(3) +; CHECK32LARGE: name: read_float +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @f +; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc_and_gprc_nor0 = ADDItocL killed %[[SCRATCH1]], @f +; CHECK32LARGE-NEXT: LFS 0, killed %[[SCRATCH2]] :: (dereferenceable load (s32) from @f) + +; FIXME: peephole optimization opportunity for lower part relocation @l to the consuming lfs +; TEST32LARGE: .read_float: +; TEST32LARGE: addis 3, f[TD]@u(2) +; TEST32LARGE-NEXT: la 3, f[TD]@l(3) +; TEST32LARGE-NEXT: lfs 1, 0(3) define dso_local void @write_double(double %in) { entry: @@ -121,6 +158,14 @@ define dso_local void @write_double(double %in) { ; TEST64: ld 3, L..C1(2) ; TEST64-NEXT: stfd 1, 0(3) +; CHECK32LARGE: name: write_double +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @d +; CHECK32LARGE: LWZtocL @d, killed %[[SCRATCH1]] :: (load (s32) from got) + +; TEST32LARGE: .write_double: +; TEST32LARGE: addis 3, L..C1@u(2) +; TEST32LARGE-NEXT: lwz 3, L..C1@l(3) +; TEST32LARGE-NEXT: stfd 1, 0(3) define dso_local nonnull ptr @addr() { entry: @@ -144,6 +189,15 @@ define dso_local nonnull ptr @addr() { ; TEST64: .addr ; TEST64: la 3, i[TD](2) +; CHECK32LARGE: name: addr +; CHECK32LARGE: %[[SCRATCH1:[0-9]+]]:gprc_and_gprc_nor0 = ADDIStocHA $r2, @i +; CHECK32LARGE-NEXT: %[[SCRATCH2:[0-9]+]]:gprc = ADDItocL killed %[[SCRATCH1]], @i +; CHECK32LARGE-NEXT: $r3 = COPY %[[SCRATCH2]] + +; TEST32LARGE: .addr: +; TEST32LARGE: addis 3, i[TD]@u(2) +; TEST32LARGE-NEXT: la 3, i[TD]@l(3) + ; TEST32: .toc ; TEST32: .tc ll[TC],ll[RW] ; TEST32-NOT: .csect ll[TD] @@ -170,4 +224,17 @@ define dso_local nonnull ptr @addr() { ; TEST64-NEXT: .globl f[TD] ; TEST64-NOT: .tc f[TD],f[RW] +; TEST32LARGE: .toc +; TEST32LARGE: .tc ll[TE],ll[RW] +; TEST32LARGE-NOT: .csect ll[TD] +; TEST32LARGE: .tc d[TE],d[RW] +; TEST32LARGE-NOT: .csect d[TD],2 +; TEST32LARGE: .csect i[TD],2 +; TEST32LARGE-NEXT: .globl i[TD] +; TEST32LARGE-NEXT: .align 2 +; TEST32LARGE-NOT: .tc i[TE],i[RW] +; TEST32LARGE: .csect f[TD],2 +; TEST32LARGE-NEXT: 
.globl f[TD] +; TEST32LARGE-NOT: .tc f[TE],f[RW] + attributes #0 = { "toc-data" } diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 2326599bf3513..080783fdeec02 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -115,6 +115,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+zacas %s -o - | FileCheck --check-prefix=RV32ZACAS %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalasr %s -o - | FileCheck --check-prefix=RV32ZALASR %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zalrsc %s -o - | FileCheck --check-prefix=RV32ZALRSC %s +; RUN: llc -mtriple=riscv32 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV32ZAMA16B %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zicfilp %s -o - | FileCheck --check-prefix=RV32ZICFILP %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zabha %s -o - | FileCheck --check-prefix=RV32ZABHA %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-ssnpm %s -o - | FileCheck --check-prefix=RV32SSNPM %s @@ -199,6 +200,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s ; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s ; RUN: llc -mtriple=riscv64 -mattr=+za128rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA128RS %s +; RUN: llc -mtriple=riscv64 -mattr=+zama16b %s -o - | FileCheck --check-prefixes=CHECK,RV64ZAMA16B %s ; RUN: llc -mtriple=riscv64 -mattr=+zawrs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZAWRS %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ztso %s -o - | FileCheck --check-prefixes=CHECK,RV64ZTSO %s ; RUN: llc -mtriple=riscv64 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCA %s @@ -370,6 +372,7 @@ ; RV32ZACAS: .attribute 5, "rv32i2p1_a2p1_zacas1p0" ; RV32ZALASR: .attribute 5, "rv32i2p1_zalasr0p1" ; RV32ZALRSC: .attribute 5, "rv32i2p1_zalrsc0p2" +; RV32ZAMA16B: .attribute 5, "rv32i2p1_zama16b1p0" ; RV32ZICFILP: .attribute 5, "rv32i2p1_zicfilp0p4" ; RV32ZABHA: .attribute 5, "rv32i2p1_a2p1_zabha1p0" ; RV32SSNPM: .attribute 5, "rv32i2p1_ssnpm0p8" @@ -418,6 +421,7 @@ ; RV64ZICBOZ: .attribute 5, "rv64i2p1_zicboz1p0" ; RV64ZA64RS: .attribute 5, "rv64i2p1_za64rs1p0" ; RV64ZA128RS: .attribute 5, "rv64i2p1_za128rs1p0" +; RV64ZAMA16B: .attribute 5, "rv64i2p1_zama16b1p0" ; RV64ZAWRS: .attribute 5, "rv64i2p1_zawrs1p0" ; RV64ZICBOP: .attribute 5, "rv64i2p1_zicbop1p0" ; RV64SHCOUNTERENW: .attribute 5, "rv64i2p1_shcounterenw1p0" diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index 65d0768c60885..ea8feef332984 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -128,43 +128,113 @@ define i64 @ctz_nxv8i1_no_range( %a) { define i32 @ctz_nxv16i1( %pg, %a) { ; RV32-LABEL: ctz_nxv16i1: ; RV32: # %bb.0: -; RV32-NEXT: vmv1r.v v0, v8 +; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: bgez a0, .LBB2_2 +; RV32-NEXT: # %bb.1: ; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: slli a0, a0, 1 -; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vid.v v16 -; RV32-NEXT: li a1, -1 -; RV32-NEXT: vmadd.vx v16, a1, v8 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a1, v8 -; RV32-NEXT: sub a0, a0, a1 +; RV32-NEXT: .LBB2_2: ; RV32-NEXT: ret ; ; RV64-LABEL: 
ctz_nxv16i1: ; RV64: # %bb.0: -; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: bgez a0, .LBB2_2 +; RV64-NEXT: # %bb.1: ; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: slli a0, a0, 1 -; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma -; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vid.v v16 -; RV64-NEXT: li a1, -1 -; RV64-NEXT: vmadd.vx v16, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a1, v8 -; RV64-NEXT: subw a0, a0, a1 +; RV64-NEXT: .LBB2_2: ; RV64-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 0) ret i32 %res } +define i32 @ctz_nxv16i1_poison( %pg, %a) { +; RV32-LABEL: ctz_nxv16i1_poison: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_nxv16i1_poison: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.nxv16i1( %a, i1 1) + ret i32 %res +} + +define i32 @ctz_v16i1(<16 x i1> %pg, <16 x i1> %a) { +; RV32-LABEL: ctz_v16i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: bgez a0, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v16i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: bgez a0, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 16 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0) + ret i32 %res +} + +define i32 @ctz_v16i1_poison(<16 x i1> %pg, <16 x i1> %a) { +; RV32-LABEL: ctz_v16i1_poison: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vfirst.m a0, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v16i1_poison: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vfirst.m a0, v8 +; RV64-NEXT: ret + %res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 1) + ret i32 %res +} + +define i16 @ctz_v8i1_i16_ret(<8 x i1> %a) { +; RV32-LABEL: ctz_v8i1_i16_ret: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: vfirst.m a0, v0 +; RV32-NEXT: bgez a0, .LBB6_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 8 +; RV32-NEXT: .LBB6_2: +; RV32-NEXT: ret +; +; RV64-LABEL: ctz_v8i1_i16_ret: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: vfirst.m a0, v0 +; RV64-NEXT: bgez a0, .LBB6_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 8 +; RV64-NEXT: .LBB6_2: +; RV64-NEXT: ret + %res = call i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1> %a, i1 0) + ret i16 %res +} + declare i64 @llvm.experimental.cttz.elts.i64.nxv8i16(, i1) declare i32 @llvm.experimental.cttz.elts.i32.nxv16i1(, i1) declare i32 @llvm.experimental.cttz.elts.i32.nxv4i32(, i1) +declare i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1>, i1) +declare i16 @llvm.experimental.cttz.elts.i16.v16i1(<8 x i1>, i1) attributes #0 = { vscale_range(2,1024) } diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll index 49d4760a2e9ab..94b717b42e92b 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts.ll @@ -48,31 +48,13 @@ define i32 @ctz_v2i1_poison(<2 x i1> %a) { ; RV32-LABEL: 
ctz_v2i1_poison: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vim v8, v8, -1, v0 -; RV32-NEXT: vid.v v9 -; RV32-NEXT: vrsub.vi v9, v9, 2 -; RV32-NEXT: vand.vv v8, v8, v9 -; RV32-NEXT: vredmaxu.vs v8, v8, v8 -; RV32-NEXT: vmv.x.s a0, v8 -; RV32-NEXT: li a1, 2 -; RV32-NEXT: sub a1, a1, a0 -; RV32-NEXT: andi a0, a1, 255 +; RV32-NEXT: vfirst.m a0, v0 ; RV32-NEXT: ret ; ; RV64-LABEL: ctz_v2i1_poison: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vim v8, v8, -1, v0 -; RV64-NEXT: vid.v v9 -; RV64-NEXT: vrsub.vi v9, v9, 2 -; RV64-NEXT: vand.vv v8, v8, v9 -; RV64-NEXT: vredmaxu.vs v8, v8, v8 -; RV64-NEXT: vmv.x.s a0, v8 -; RV64-NEXT: li a1, 2 -; RV64-NEXT: subw a1, a1, a0 -; RV64-NEXT: andi a0, a1, 255 +; RV64-NEXT: vfirst.m a0, v0 ; RV64-NEXT: ret %res = call i32 @llvm.experimental.cttz.elts.i32.v2i1(<2 x i1> %a, i1 1) ret i32 %res diff --git a/llvm/test/CodeGen/RISCV/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/memcpy-inline.ll index 343695ee37da8..833e07351eec7 100644 --- a/llvm/test/CodeGen/RISCV/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/memcpy-inline.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST ; ---------------------------------------------------------------------- diff --git a/llvm/test/CodeGen/RISCV/memcpy.ll b/llvm/test/CodeGen/RISCV/memcpy.ll index 12ec0881b20d9..02f582339d0b7 100644 --- a/llvm/test/CodeGen/RISCV/memcpy.ll +++ b/llvm/test/CodeGen/RISCV/memcpy.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/llvm/test/CodeGen/RISCV/memset-inline.ll b/llvm/test/CodeGen/RISCV/memset-inline.ll index cc22b77c641e2..55fe81a58805e 100644 --- a/llvm/test/CodeGen/RISCV/memset-inline.ll +++ b/llvm/test/CodeGen/RISCV/memset-inline.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64 -; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST -; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+fast-unaligned-access \ +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+unaligned-scalar-mem \ ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/llvm/test/CodeGen/RISCV/mul.ll 
b/llvm/test/CodeGen/RISCV/mul.ll index af341dbaadeab..364e8c7b38dac 100644 --- a/llvm/test/CodeGen/RISCV/mul.ll +++ b/llvm/test/CodeGen/RISCV/mul.ll @@ -465,6 +465,192 @@ define i32 @mulhu_constant(i32 %a) nounwind { ret i32 %4 } +define i32 @muli32_p14(i32 %a) nounwind { +; RV32I-LABEL: muli32_p14: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 14 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p14: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 14 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p14: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 14 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p14: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 14 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 14 + ret i32 %1 +} + +define i32 @muli32_p28(i32 %a) nounwind { +; RV32I-LABEL: muli32_p28: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 28 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p28: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 28 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p28: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 28 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p28: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 28 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 28 + ret i32 %1 +} + +define i32 @muli32_p30(i32 %a) nounwind { +; RV32I-LABEL: muli32_p30: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 30 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p30: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 30 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p30: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 30 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p30: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 30 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 30 + ret i32 %1 +} + +define i32 @muli32_p56(i32 %a) nounwind { +; RV32I-LABEL: muli32_p56: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 56 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p56: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 56 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p56: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 56 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p56: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 56 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 56 + ret i32 %1 +} + +define i32 @muli32_p60(i32 %a) nounwind { +; RV32I-LABEL: muli32_p60: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 60 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p60: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 60 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p60: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill 
+; RV64I-NEXT: li a1, 60 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p60: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 60 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 60 + ret i32 %1 +} + +define i32 @muli32_p62(i32 %a) nounwind { +; RV32I-LABEL: muli32_p62: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 62 +; RV32I-NEXT: tail __mulsi3 +; +; RV32IM-LABEL: muli32_p62: +; RV32IM: # %bb.0: +; RV32IM-NEXT: li a1, 62 +; RV32IM-NEXT: mul a0, a0, a1 +; RV32IM-NEXT: ret +; +; RV64I-LABEL: muli32_p62: +; RV64I: # %bb.0: +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: li a1, 62 +; RV64I-NEXT: call __muldi3 +; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ret +; +; RV64IM-LABEL: muli32_p62: +; RV64IM: # %bb.0: +; RV64IM-NEXT: li a1, 62 +; RV64IM-NEXT: mulw a0, a0, a1 +; RV64IM-NEXT: ret + %1 = mul i32 %a, 62 + ret i32 %1 +} + define i32 @muli32_p65(i32 %a) nounwind { ; RV32I-LABEL: muli32_p65: ; RV32I: # %bb.0: @@ -600,6 +786,8 @@ define i64 @muli64_p63(i64 %a) nounwind { ret i64 %1 } + + define i32 @muli32_m63(i32 %a) nounwind { ; RV32I-LABEL: muli32_m63: ; RV32I: # %bb.0: @@ -1145,10 +1333,10 @@ define i128 @muli128_m3840(i128 %a) nounwind { ; RV32I-NEXT: sltu a7, a6, a4 ; RV32I-NEXT: sub t0, t1, t0 ; RV32I-NEXT: mv t1, a7 -; RV32I-NEXT: beq a5, a3, .LBB30_2 +; RV32I-NEXT: beq a5, a3, .LBB36_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t1, a5, a3 -; RV32I-NEXT: .LBB30_2: +; RV32I-NEXT: .LBB36_2: ; RV32I-NEXT: sub a2, a2, a1 ; RV32I-NEXT: sltu a1, a2, t1 ; RV32I-NEXT: sub a1, t0, a1 @@ -1261,10 +1449,10 @@ define i128 @muli128_m63(i128 %a) nounwind { ; RV32I-NEXT: slli t0, a1, 6 ; RV32I-NEXT: or a7, t0, a7 ; RV32I-NEXT: mv t0, a5 -; RV32I-NEXT: beq a1, a7, .LBB31_2 +; RV32I-NEXT: beq a1, a7, .LBB37_2 ; RV32I-NEXT: # %bb.1: ; RV32I-NEXT: sltu t0, a1, a7 -; RV32I-NEXT: .LBB31_2: +; RV32I-NEXT: .LBB37_2: ; RV32I-NEXT: srli t1, a1, 26 ; RV32I-NEXT: slli t2, a6, 6 ; RV32I-NEXT: or t1, t2, t1 diff --git a/llvm/test/CodeGen/RISCV/pr56110.ll b/llvm/test/CodeGen/RISCV/pr56110.ll index c795b17419f56..fa441f5fc3aef 100644 --- a/llvm/test/CodeGen/RISCV/pr56110.ll +++ b/llvm/test/CodeGen/RISCV/pr56110.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=riscv32 | FileCheck %s -; RUN: llc < %s -mtriple=riscv32 -mattr=+fast-unaligned-access | FileCheck %s +; RUN: llc < %s -mtriple=riscv32 -mattr=+unaligned-scalar-mem | FileCheck %s define void @foo_set(ptr nocapture noundef %a, i32 noundef %v) { ; CHECK-LABEL: foo_set: diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.ll b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll new file mode 100644 index 0000000000000..34ab74d78a76f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.ll @@ -0,0 +1,105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=NO-PREFER-W-INST %s +; RUN: llc -mtriple=riscv64 -mattr=+m -riscv-disable-strip-w-suffix -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=NO-STRIP %s +; RUN: llc -mtriple=riscv64 -mattr=+m,+prefer-w-inst -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=PREFER-W-INST %s + +define i32 @addiw(i32 %a) { +; NO-PREFER-W-INST-LABEL: addiw: +; NO-PREFER-W-INST: # %bb.0: +; 
NO-PREFER-W-INST-NEXT: lui a1, 1 +; NO-PREFER-W-INST-NEXT: addi a1, a1, -1 +; NO-PREFER-W-INST-NEXT: addw a0, a0, a1 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: addiw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: lui a1, 1 +; NO-STRIP-NEXT: addiw a1, a1, -1 +; NO-STRIP-NEXT: addw a0, a0, a1 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: addiw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: lui a1, 1 +; PREFER-W-INST-NEXT: addiw a1, a1, -1 +; PREFER-W-INST-NEXT: addw a0, a0, a1 +; PREFER-W-INST-NEXT: ret + %ret = add i32 %a, 4095 + ret i32 %ret +} + +define i32 @addw(i32 %a, i32 %b) { +; NO-PREFER-W-INST-LABEL: addw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: add a0, a0, a1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: addw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: addw a0, a0, a1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: addw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: addw a0, a0, a1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %add = add i32 %a, %b + %ret = add i32 %add, 1024 + ret i32 %ret +} + +define i32 @mulw(i32 %a, i32 %b) { +; NO-PREFER-W-INST-LABEL: mulw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: mul a1, a0, a1 +; NO-PREFER-W-INST-NEXT: mul a0, a0, a1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: mulw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: mulw a1, a0, a1 +; NO-STRIP-NEXT: mulw a0, a0, a1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: mulw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: mulw a1, a0, a1 +; PREFER-W-INST-NEXT: mulw a0, a0, a1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %mul1 = mul i32 %a, %b + %mul = mul i32 %a, %mul1 + %ret = add i32 %mul, 1024 + ret i32 %ret +} + +define i32 @slliw(i32 %a) { +; NO-PREFER-W-INST-LABEL: slliw: +; NO-PREFER-W-INST: # %bb.0: +; NO-PREFER-W-INST-NEXT: slli a0, a0, 1 +; NO-PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; NO-PREFER-W-INST-NEXT: ret +; +; NO-STRIP-LABEL: slliw: +; NO-STRIP: # %bb.0: +; NO-STRIP-NEXT: slliw a0, a0, 1 +; NO-STRIP-NEXT: addiw a0, a0, 1024 +; NO-STRIP-NEXT: ret +; +; PREFER-W-INST-LABEL: slliw: +; PREFER-W-INST: # %bb.0: +; PREFER-W-INST-NEXT: slliw a0, a0, 1 +; PREFER-W-INST-NEXT: addiw a0, a0, 1024 +; PREFER-W-INST-NEXT: ret + %shl = shl i32 %a, 1 + %ret = add i32 %shl, 1024 + ret i32 %ret +} diff --git a/llvm/test/CodeGen/RISCV/prefer-w-inst.mir b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir new file mode 100644 index 0000000000000..e05e27af4271c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/prefer-w-inst.mir @@ -0,0 +1,262 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \ +# RUN: -mattr=+m -o - | FileCheck %s -check-prefixes=NO-PREFER-W-INST +# RUN: llc %s -mtriple=riscv64 -run-pass=riscv-opt-w-instrs -verify-machineinstrs \ +# RUN: -mattr=+m,+prefer-w-inst -o - | FileCheck %s -check-prefixes=PREFER-W-INST + +--- +name: addi +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: addi + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI [[COPY]], 1 + ; NO-PREFER-W-INST-NEXT: 
[[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: addi + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[COPY]], 1 + ; PREFER-W-INST-NEXT: [[ADDIW1:%[0-9]+]]:gpr = ADDIW [[ADDIW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW1]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = ADDI %1, 1 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: add +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: add + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[ADD:%[0-9]+]]:gpr = ADD [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADD]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: add + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[ADDW:%[0-9]+]]:gpr = ADDW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[ADDW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = ADD %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: sub +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: sub + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SUB:%[0-9]+]]:gpr = SUB [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUB]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: sub + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SUBW:%[0-9]+]]:gpr = SUBW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SUBW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SUB %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
+ +--- +name: mul +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: mul + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[MUL:%[0-9]+]]:gpr = MUL [[COPY]], [[COPY1]] + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MUL]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: mul + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[MULW:%[0-9]+]]:gpr = MULW [[COPY]], [[COPY1]] + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[MULW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = MUL %1, %2 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + + +--- +name: slli_31 +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: slli_31 + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 31 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: slli_31 + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SLLIW:%[0-9]+]]:gpr = SLLIW [[COPY]], 31 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLIW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SLLI %1, 31 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: slli_32 +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: slli_32 + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: slli_32 + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[SLLI:%[0-9]+]]:gpr = SLLI [[COPY]], 32 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[SLLI]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = SLLI %1, 32 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
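+# Note on the two SLLI cases above: SLLIW only encodes 5-bit shift amounts
+# (0 to 31), so the shift-by-31 case can be rewritten to SLLIW but the
+# shift-by-32 case must stay SLLI even under +prefer-w-inst.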
+ +--- +name: ld +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: ld + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[LD:%[0-9]+]]:gpr = LD [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LD]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: ld + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = LD %1, 0 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... + +--- +name: lwu +body: | + bb.0.entry: + liveins: $x10, $x11 + ; NO-PREFER-W-INST-LABEL: name: lwu + ; NO-PREFER-W-INST: liveins: $x10, $x11 + ; NO-PREFER-W-INST-NEXT: {{ $}} + ; NO-PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; NO-PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; NO-PREFER-W-INST-NEXT: [[LWU:%[0-9]+]]:gpr = LWU [[COPY]], 0 + ; NO-PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LWU]], 1 + ; NO-PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; NO-PREFER-W-INST-NEXT: PseudoRET + ; + ; PREFER-W-INST-LABEL: name: lwu + ; PREFER-W-INST: liveins: $x10, $x11 + ; PREFER-W-INST-NEXT: {{ $}} + ; PREFER-W-INST-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 + ; PREFER-W-INST-NEXT: [[COPY1:%[0-9]+]]:gpr = COPY $x11 + ; PREFER-W-INST-NEXT: [[LW:%[0-9]+]]:gpr = LW [[COPY]], 0 + ; PREFER-W-INST-NEXT: [[ADDIW:%[0-9]+]]:gpr = ADDIW [[LW]], 1 + ; PREFER-W-INST-NEXT: $x10 = COPY [[ADDIW]] + ; PREFER-W-INST-NEXT: PseudoRET + %1:gpr = COPY $x10 + %2:gpr = COPY $x11 + %3:gpr = LWU %1, 0 + %4:gpr = ADDIW %3, 1 + $x10 = COPY %4 + PseudoRET +... 
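+# Summary, inferred from the checks above rather than from the pass source:
+# every case here feeds an ADDIW, which only reads the low 32 bits of its
+# input, and that appears to be what lets riscv-opt-w-instrs pick the
+# ADDIW/ADDW/SUBW/MULW/SLLIW forms and narrow LD/LWU to LW when
+# +prefer-w-inst is set.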
diff --git a/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll b/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll index a03dadbc1d116..d627ae9c90394 100644 --- a/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll +++ b/llvm/test/CodeGen/RISCV/riscv-func-target-feature.ll @@ -36,7 +36,7 @@ entry: } ; CHECK-NOT: .option push -define void @test5() "target-features"="+fast-unaligned-access" { +define void @test5() "target-features"="+unaligned-scalar-mem" { ; CHECK-LABEL: test5 ; CHECK-NOT: .option pop entry: diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll index 0908a393338c5..a78f823d31841 100644 --- a/llvm/test/CodeGen/RISCV/rv32zba.ll +++ b/llvm/test/CodeGen/RISCV/rv32zba.ll @@ -271,31 +271,49 @@ define i32 @mul288(i32 %a) { } define i32 @mul258(i32 %a) { -; CHECK-LABEL: mul258: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 258 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: mul258: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 258 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul258: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a0, 8 +; RV32ZBA-NEXT: sh1add a0, a0, a1 +; RV32ZBA-NEXT: ret %c = mul i32 %a, 258 ret i32 %c } define i32 @mul260(i32 %a) { -; CHECK-LABEL: mul260: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 260 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: mul260: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 260 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul260: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a0, 8 +; RV32ZBA-NEXT: sh2add a0, a0, a1 +; RV32ZBA-NEXT: ret %c = mul i32 %a, 260 ret i32 %c } define i32 @mul264(i32 %a) { -; CHECK-LABEL: mul264: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 264 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV32I-LABEL: mul264: +; RV32I: # %bb.0: +; RV32I-NEXT: li a1, 264 +; RV32I-NEXT: mul a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul264: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: slli a1, a0, 8 +; RV32ZBA-NEXT: sh3add a0, a0, a1 +; RV32ZBA-NEXT: ret %c = mul i32 %a, 264 ret i32 %c } @@ -627,3 +645,96 @@ define i32 @addshl_5_8(i32 %a, i32 %b) { %e = add i32 %c, %d ret i32 %e } + +define i32 @mul_neg1(i32 %a) { +; CHECK-LABEL: mul_neg1: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -1 + ret i32 %c +} + +define i32 @mul_neg2(i32 %a) { +; CHECK-LABEL: mul_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -2 + ret i32 %c +} + +define i32 @mul_neg3(i32 %a) { +; RV32I-LABEL: mul_neg3: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 1 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul_neg3: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh1add a0, a0, a0 +; RV32ZBA-NEXT: neg a0, a0 +; RV32ZBA-NEXT: ret + %c = mul i32 %a, -3 + ret i32 %c +} + +define i32 @mul_neg4(i32 %a) { +; CHECK-LABEL: mul_neg4: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -4 + ret i32 %c +} + +define i32 @mul_neg5(i32 %a) { +; RV32I-LABEL: mul_neg5: +; RV32I: # %bb.0: +; RV32I-NEXT: slli a1, a0, 2 +; RV32I-NEXT: neg a0, a0 +; RV32I-NEXT: sub a0, a0, a1 +; RV32I-NEXT: ret +; +; RV32ZBA-LABEL: mul_neg5: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: sh2add a0, a0, a0 +; RV32ZBA-NEXT: neg a0, a0 +; RV32ZBA-NEXT: ret + %c = mul i32 %a, -5 + ret i32 %c +} + +define i32 @mul_neg6(i32 %a) { +; CHECK-LABEL: mul_neg6: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -6 
+; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i32 %a, -6 + ret i32 %c +} + +define i32 @mul_neg7(i32 %a) { +; CHECK-LABEL: mul_neg7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i32 %a, -7 + ret i32 %c +} + +define i32 @mul_neg8(i32 %a) { +; CHECK-LABEL: mul_neg8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i32 %a, -8 + ret i32 %c +} diff --git a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll index 90cfb1fdcb779..ee9b73ca82f21 100644 --- a/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64-legal-i32/rv64zba.ll @@ -811,31 +811,49 @@ define i64 @adduw_imm(i32 signext %0) nounwind { } define i64 @mul258(i64 %a) { -; CHECK-LABEL: mul258: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 258 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul258: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 258 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul258: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh1add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 258 ret i64 %c } define i64 @mul260(i64 %a) { -; CHECK-LABEL: mul260: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 260 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul260: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 260 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul260: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh2add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 260 ret i64 %c } define i64 @mul264(i64 %a) { -; CHECK-LABEL: mul264: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 264 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul264: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 264 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul264: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh3add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 264 ret i64 %c } diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll index 6f56babf28f5e..1450c86c76d05 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadba.ll +++ b/llvm/test/CodeGen/RISCV/rv64xtheadba.ll @@ -268,6 +268,23 @@ define i64 @mul96(i64 %a) { ret i64 %c } +define i64 @mul137(i64 %a) { +; RV64I-LABEL: mul137: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 137 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64XTHEADBA-LABEL: mul137: +; RV64XTHEADBA: # %bb.0: +; RV64XTHEADBA-NEXT: th.addsl a1, a0, a0, 3 +; RV64XTHEADBA-NEXT: slli a0, a0, 7 +; RV64XTHEADBA-NEXT: add a0, a0, a1 +; RV64XTHEADBA-NEXT: ret + %c = mul i64 %a, 137 + ret i64 %c +} + define i64 @mul160(i64 %a) { ; RV64I-LABEL: mul160: ; RV64I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index bb4be323ecb2e..6939185947f44 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -567,6 +567,87 @@ define i64 @mul96(i64 %a) { ret i64 %c } +define i64 @mul119(i64 %a) { +; CHECK-LABEL: mul119: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 119 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 119 + ret i64 %c +} + +define i64 @mul123(i64 %a) { +; CHECK-LABEL: mul123: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 123 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 123 + ret i64 %c +} + +define i64 
@mul125(i64 %a) { +; CHECK-LABEL: mul125: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 125 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, 125 + ret i64 %c +} + +define i64 @mul131(i64 %a) { +; RV64I-LABEL: mul131: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 131 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul131: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a1, a0, a0 +; RV64ZBA-NEXT: slli a0, a0, 7 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, 131 + ret i64 %c +} + +define i64 @mul133(i64 %a) { +; RV64I-LABEL: mul133: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 133 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul133: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh2add a1, a0, a0 +; RV64ZBA-NEXT: slli a0, a0, 7 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, 133 + ret i64 %c +} + +define i64 @mul137(i64 %a) { +; RV64I-LABEL: mul137: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 137 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul137: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh3add a1, a0, a0 +; RV64ZBA-NEXT: slli a0, a0, 7 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, 137 + ret i64 %c +} + define i64 @mul160(i64 %a) { ; RV64I-LABEL: mul160: ; RV64I: # %bb.0: @@ -834,31 +915,49 @@ define i64 @adduw_imm(i32 signext %0) nounwind { } define i64 @mul258(i64 %a) { -; CHECK-LABEL: mul258: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 258 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul258: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 258 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul258: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh1add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 258 ret i64 %c } define i64 @mul260(i64 %a) { -; CHECK-LABEL: mul260: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 260 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul260: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 260 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul260: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh2add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 260 ret i64 %c } define i64 @mul264(i64 %a) { -; CHECK-LABEL: mul264: -; CHECK: # %bb.0: -; CHECK-NEXT: li a1, 264 -; CHECK-NEXT: mul a0, a0, a1 -; CHECK-NEXT: ret +; RV64I-LABEL: mul264: +; RV64I: # %bb.0: +; RV64I-NEXT: li a1, 264 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul264: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a1, a0, 8 +; RV64ZBA-NEXT: sh3add a0, a0, a1 +; RV64ZBA-NEXT: ret %c = mul i64 %a, 264 ret i64 %c } @@ -2412,3 +2511,118 @@ define ptr @test_gep_gep_dont_crash(ptr %p, i64 %a1, i64 %a2) { %gep2 = getelementptr i64, ptr %gep1, i64 %a1 ret ptr %gep2 } + +define i64 @regression(i32 signext %x, i32 signext %y) { +; RV64I-LABEL: regression: +; RV64I: # %bb.0: +; RV64I-NEXT: subw a0, a0, a1 +; RV64I-NEXT: slli a0, a0, 32 +; RV64I-NEXT: li a1, 3 +; RV64I-NEXT: slli a1, a1, 35 +; RV64I-NEXT: mulhu a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: regression: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: slli.uw a0, a0, 3 +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: ret + %sub = sub i32 %x, %y + %ext = zext i32 %sub to i64 + %res = mul nuw nsw i64 %ext, 24 + ret i64 %res +} + +define i64 @mul_neg1(i64 %a) { +; CHECK-LABEL: mul_neg1: +; CHECK: # %bb.0: +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul 
i64 %a, -1 + ret i64 %c +} + +define i64 @mul_neg2(i64 %a) { +; CHECK-LABEL: mul_neg2: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -2 + ret i64 %c +} + +define i64 @mul_neg3(i64 %a) { +; RV64I-LABEL: mul_neg3: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul_neg3: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: neg a0, a0 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, -3 + ret i64 %c +} + +define i64 @mul_neg4(i64 %a) { +; CHECK-LABEL: mul_neg4: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -4 + ret i64 %c +} + +define i64 @mul_neg5(i64 %a) { +; RV64I-LABEL: mul_neg5: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 2 +; RV64I-NEXT: neg a0, a0 +; RV64I-NEXT: sub a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: mul_neg5: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh2add a0, a0, a0 +; RV64ZBA-NEXT: neg a0, a0 +; RV64ZBA-NEXT: ret + %c = mul i64 %a, -5 + ret i64 %c +} + +define i64 @mul_neg6(i64 %a) { +; CHECK-LABEL: mul_neg6: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, -6 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, -6 + ret i64 %c +} + +define i64 @mul_neg7(i64 %a) { +; CHECK-LABEL: mul_neg7: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret + %c = mul i64 %a, -7 + ret i64 %c +} + +define i64 @mul_neg8(i64 %a) { +; CHECK-LABEL: mul_neg8: +; CHECK: # %bb.0: +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: neg a0, a0 +; CHECK-NEXT: ret + %c = mul i64 %a, -8 + ret i64 %c +} diff --git a/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll new file mode 100644 index 0000000000000..c99388cbdaf44 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv-cfi-info.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=OMIT-FP %s +; RUN: llc -mtriple=riscv64 -mattr=+v,+m -verify-machineinstrs -frame-pointer=all < %s \ +; RUN: | FileCheck -check-prefix=NO-OMIT-FP %s + +define riscv_vector_cc @test_vector_callee_cfi( %va) { +; OMIT-FP-LABEL: test_vector_callee_cfi: +; OMIT-FP: # %bb.0: # %entry +; OMIT-FP-NEXT: addi sp, sp, -16 +; OMIT-FP-NEXT: .cfi_def_cfa_offset 16 +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 3 +; OMIT-FP-NEXT: sub sp, sp, a0 +; OMIT-FP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: li a1, 6 +; OMIT-FP-NEXT: mul a0, a0, a1 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 2 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; OMIT-FP-NEXT: addi a0, sp, 16 +; OMIT-FP-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; OMIT-FP-NEXT: .cfi_escape 0x10, 0x61, 0x08, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 2 * vlenb +; OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x08, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 4 * vlenb +; OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x08, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa 
- 8 * vlenb +; OMIT-FP-NEXT: #APP +; OMIT-FP-NEXT: #NO_APP +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: li a1, 6 +; OMIT-FP-NEXT: mul a0, a0, a1 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 2 +; OMIT-FP-NEXT: add a0, sp, a0 +; OMIT-FP-NEXT: addi a0, a0, 16 +; OMIT-FP-NEXT: vl2r.v v2, (a0) # Unknown-size Folded Reload +; OMIT-FP-NEXT: addi a0, sp, 16 +; OMIT-FP-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; OMIT-FP-NEXT: csrr a0, vlenb +; OMIT-FP-NEXT: slli a0, a0, 3 +; OMIT-FP-NEXT: add sp, sp, a0 +; OMIT-FP-NEXT: addi sp, sp, 16 +; OMIT-FP-NEXT: ret +; +; NO-OMIT-FP-LABEL: test_vector_callee_cfi: +; NO-OMIT-FP: # %bb.0: # %entry +; NO-OMIT-FP-NEXT: addi sp, sp, -32 +; NO-OMIT-FP-NEXT: .cfi_def_cfa_offset 32 +; NO-OMIT-FP-NEXT: sd ra, 24(sp) # 8-byte Folded Spill +; NO-OMIT-FP-NEXT: sd s0, 16(sp) # 8-byte Folded Spill +; NO-OMIT-FP-NEXT: .cfi_offset ra, -8 +; NO-OMIT-FP-NEXT: .cfi_offset s0, -16 +; NO-OMIT-FP-NEXT: addi s0, sp, 32 +; NO-OMIT-FP-NEXT: .cfi_def_cfa s0, 0 +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 3 +; NO-OMIT-FP-NEXT: sub sp, sp, a0 +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 1 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 2 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 3 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x61, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v1 @ cfa - 32 - 2 * vlenb +; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x62, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x7c, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v2m2 @ cfa - 32 - 4 * vlenb +; NO-OMIT-FP-NEXT: .cfi_escape 0x10, 0x64, 0x0b, 0x11, 0x60, 0x22, 0x11, 0x78, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # $v4m4 @ cfa - 32 - 8 * vlenb +; NO-OMIT-FP-NEXT: #APP +; NO-OMIT-FP-NEXT: #NO_APP +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 1 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 2 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vl2r.v v2, (a0) # Unknown-size Folded Reload +; NO-OMIT-FP-NEXT: csrr a0, vlenb +; NO-OMIT-FP-NEXT: slli a0, a0, 3 +; NO-OMIT-FP-NEXT: sub a0, s0, a0 +; NO-OMIT-FP-NEXT: addi a0, a0, -32 +; NO-OMIT-FP-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; NO-OMIT-FP-NEXT: addi sp, s0, -32 +; NO-OMIT-FP-NEXT: ld ra, 24(sp) # 8-byte Folded Reload +; NO-OMIT-FP-NEXT: ld s0, 16(sp) # 8-byte Folded Reload +; NO-OMIT-FP-NEXT: addi sp, sp, 32 +; NO-OMIT-FP-NEXT: ret +entry: + call void asm sideeffect "", + "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() + + ret %va +} diff --git a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll index 78e8700a9feff..647d3158b6167 100644 --- a/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll +++ b/llvm/test/CodeGen/RISCV/rvv/calling-conv.ll @@ -162,3 +162,206 @@ 
define void @caller_tuple_argument({, } %x) } declare void @callee_tuple_argument({, }) + +; %0 -> v8 +; %1 -> v9 +define @case1( %0, %1) { +; CHECK-LABEL: case1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %a = add %0, %1 + ret %a +} + +; %0 -> v8 +; %1 -> v10-v11 +; %2 -> v9 +define @case2_1( %0, %1, %2) { +; CHECK-LABEL: case2_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %a = add %0, %2 + ret %a +} +define @case2_2( %0, %1, %2) { +; CHECK-LABEL: case2_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vadd.vv v8, v10, v10 +; CHECK-NEXT: ret + %a = add %1, %1 + ret %a +} + +; %0 -> v8 +; %1 -> {v10-v11, v12-v13} +; %2 -> v9 +define @case3_1( %0, {, } %1, %2) { +; CHECK-LABEL: case3_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %add = add %0, %2 + ret %add +} +define @case3_2( %0, {, } %1, %2) { +; CHECK-LABEL: case3_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma +; CHECK-NEXT: vadd.vv v8, v10, v12 +; CHECK-NEXT: ret + %a = extractvalue { , } %1, 0 + %b = extractvalue { , } %1, 1 + %add = add %a, %b + ret %add +} + +; %0 -> v8 +; %1 -> {by-ref, by-ref} +; %2 -> v9 +define @case4_1( %0, {, } %1, %2) { +; CHECK-LABEL: case4_1: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, a0, a1 +; CHECK-NEXT: vl8re64.v v8, (a1) +; CHECK-NEXT: vl8re64.v v16, (a0) +; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: ret + %a = extractvalue { , } %1, 0 + %b = extractvalue { , } %1, 1 + %add = add %a, %b + ret %add +} +define @case4_2( %0, {, } %1, %2) { +; CHECK-LABEL: case4_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; CHECK-NEXT: vadd.vv v8, v8, v9 +; CHECK-NEXT: ret + %add = add %0, %2 + ret %add +} + +declare @callee1() +declare void @callee2() +declare void @callee3() +define void @caller() { +; RV32-LABEL: caller: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: call callee1 +; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v8 +; RV32-NEXT: call callee2 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: caller: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; RV64-NEXT: .cfi_offset ra, -8 +; RV64-NEXT: call callee1 +; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: call callee2 +; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %a = call @callee1() + %add = add %a, %a + call void @callee2( %add) + ret void +} + +declare {, } @callee_tuple() +define void @caller_tuple() { +; RV32-LABEL: caller_tuple: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32-NEXT: .cfi_offset ra, -4 +; RV32-NEXT: call callee_tuple +; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; RV32-NEXT: vadd.vv v8, v8, v10 +; RV32-NEXT: call callee3 +; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32-NEXT: addi sp, sp, 
+; RV32-NEXT: ret
+;
+; RV64-LABEL: caller_tuple:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: call callee_tuple
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: call callee3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+  %a = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @callee_tuple()
+  %b = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %a, 0
+  %c = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %a, 1
+  %add = add <vscale x 4 x i32> %b, %c
+  call void @callee3(<vscale x 4 x i32> %add)
+  ret void
+}
+
+declare {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} @callee_nested()
+define void @caller_nested() {
+; RV32-LABEL: caller_nested:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: call callee_nested
+; RV32-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV32-NEXT: vadd.vv v8, v8, v10
+; RV32-NEXT: vadd.vv v8, v8, v12
+; RV32-NEXT: call callee3
+; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: caller_nested:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: .cfi_def_cfa_offset 16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: call callee_nested
+; RV64-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; RV64-NEXT: vadd.vv v8, v8, v10
+; RV64-NEXT: vadd.vv v8, v8, v12
+; RV64-NEXT: call callee3
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+  %a = call {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} @callee_nested()
+  %b = extractvalue {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} %a, 0
+  %c = extractvalue {<vscale x 4 x i32>, {<vscale x 4 x i32>, <vscale x 4 x i32>}} %a, 1
+  %c0 = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %c, 0
+  %c1 = extractvalue {<vscale x 4 x i32>, <vscale x 4 x i32>} %c, 1
+  %add0 = add <vscale x 4 x i32> %b, %c0
+  %add1 = add <vscale x 4 x i32> %add0, %c1
+  call void @callee3(<vscale x 4 x i32> %add1)
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
index f244810e739d9..ff35043dbd7e7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/concat-vectors-constant-stride.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+v,+fast-unaligned-access -target-abi=ilp32 \
+; RUN: llc -mtriple=riscv32 -mattr=+v,+unaligned-vector-mem -target-abi=ilp32 \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+v,+fast-unaligned-access -target-abi=lp64 \
+; RUN: llc -mtriple=riscv64 -mattr=+v,+unaligned-vector-mem -target-abi=lp64 \
 ; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
 
 define void @constant_forward_stride(ptr %s, ptr %d) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
index 1d3c22a02efc0..ab6df1d3e883f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -628,6 +628,7 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV32-NEXT: vs8r.v v8, (a0)
 ; RV32-NEXT: vs8r.v v16, (a1)
 ; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: .cfi_def_cfa sp, 80
 ; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT: addi sp, sp, 80
@@ -661,6 +662,7 @@ define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
 ; RV64-NEXT: vs8r.v v8, (a0)
 ; RV64-NEXT: vs8r.v v16, (a1)
 ; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: .cfi_def_cfa sp, 80
 ; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT: addi sp, sp, 80
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
index 657d52354aa39..f0fcc482e2207 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-combine.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,RV64
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+fast-unaligned-access -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+unaligned-vector-mem -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64,RV64-MISALIGN
 ; RUN: llc -mtriple=riscv64 -mattr=+f,+zfh,+zve64f,+zvl128b,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,CHECK-NO-MISALIGN,ZVE64F
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
index fffc4d6c08335..36c36a13964c9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-unaligned.ll
@@ -3,9 +3,9 @@
 ; RUN: | FileCheck %s --check-prefixes=SLOW,RV32-SLOW
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \
 ; RUN: | FileCheck %s --check-prefixes=SLOW,RV64-SLOW
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+fast-unaligned-access -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+unaligned-vector-mem -verify-machineinstrs < %s \
 ; RUN: | FileCheck %s --check-prefixes=FAST,RV32-FAST
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+fast-unaligned-access -verify-machineinstrs < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+unaligned-vector-mem -verify-machineinstrs < %s \
 ; RUN: | FileCheck %s --check-prefixes=FAST,RV64-FAST
 
 define <4 x i32> @load_v4i32_align1(ptr %ptr) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
index 485f94ee2a102..53598c609107b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
@@ -3,9 +3,9 @@
 ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+v \
 ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+fast-unaligned-access \
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+unaligned-scalar-mem,+unaligned-vector-mem \
 ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+fast-unaligned-access \
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+unaligned-scalar-mem,+unaligned-vector-mem \
 ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST
 
 ; ----------------------------------------------------------------------
diff --git a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
index 0e7e914cf68e8..accc18519d626 100644
--- a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
@@ -3,9 +3,9 @@
 ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32
 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v \
 ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64
-; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+fast-unaligned-access \
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+unaligned-scalar-mem,+unaligned-vector-mem \
 ; RUN: | FileCheck %s --check-prefixes=RV32-BOTH,RV32-FAST
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+fast-unaligned-access \
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+unaligned-scalar-mem,+unaligned-vector-mem \
 ; RUN: | FileCheck %s --check-prefixes=RV64-BOTH,RV64-FAST
 
 %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/pr88799.ll b/llvm/test/CodeGen/RISCV/rvv/pr88799.ll
new file mode 100644
index 0000000000000..7212a789f9e7e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/pr88799.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64-unknown-linux-gnu -mattr=+v | FileCheck %s
+
+define i32 @main() vscale_range(2,2) {
+; CHECK-LABEL: main:
+; CHECK: # %bb.0: # %vector.body
+; CHECK-NEXT: lui a0, 1040368
+; CHECK-NEXT: addiw a0, a0, -144
+; CHECK-NEXT: vl2re16.v v8, (a0)
+; CHECK-NEXT: vs2r.v v8, (zero)
+; CHECK-NEXT: li a0, 0
+; CHECK-NEXT: ret
+vector.body:
+  %0 = load <16 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16
+  store <16 x i16> %0, ptr null, align 2
+  %wide.load = load <vscale x 8 x i16>, ptr getelementptr ([3 x [23 x [23 x i16]]], ptr null, i64 -10593, i64 1, i64 22, i64 0), align 16
+  store <vscale x 8 x i16> %wide.load, ptr null, align 2
+  ret i32 0
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll
index f488baf5a9d9f..1491bb6c337a0 100644
--- a/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/unaligned-loads-stores.ll
@@ -3,9 +3,9 @@
 ; RUN: -verify-machineinstrs | FileCheck %s
 ; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v < %s \
 ; RUN: -verify-machineinstrs | FileCheck %s
-; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+zvfh,+v,+fast-unaligned-access < %s \
+; RUN: llc -mtriple riscv32 -mattr=+d,+zfh,+zvfh,+v,+unaligned-vector-mem < %s \
 ; RUN: -verify-machineinstrs | FileCheck --check-prefix=FAST %s
-; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v,+fast-unaligned-access < %s \
+; RUN: llc -mtriple riscv64 -mattr=+d,+zfh,+zvfh,+v,+unaligned-vector-mem < %s \
 ; RUN: -verify-machineinstrs | FileCheck --check-prefix=FAST %s
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
index a320aecc6fce4..6a712080fda74 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-load.ll
@@ -18,10 +18,10 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i
 ; CHECK-NEXT: vmerge.vim v14, v10, 1, v0
 ; CHECK-NEXT: vmv1r.v v0, v8
 ; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
-; CHECK-NEXT: vnsrl.wi v8, v12, 0
-; CHECK-NEXT: vmsne.vi v0, v8, 0
-; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vnsrl.wi v10, v12, 0
 ; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vmsne.vi v9, v10, 0
 ; CHECK-NEXT: ret
   %vec = load <vscale x 32 x i1>, ptr %p
   %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index ef4baf34d23f0..d98597fabcd95 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -8,18 +8,18 @@ define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v10, 0
-; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v12, v8, 1, v0
 ; CHECK-NEXT: csrr a0, vlenb
 ; CHECK-NEXT: srli a0, a0, 2
 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
 ; CHECK-NEXT: vslidedown.vx v0, v0, a0
 ; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
-; CHECK-NEXT: vnsrl.wi v12, v8, 0
-; CHECK-NEXT: vmsne.vi v0, v12, 0
-; CHECK-NEXT: vnsrl.wi v12, v8, 8
-; CHECK-NEXT: vmsne.vi v8, v12, 0
+; CHECK-NEXT: vmerge.vim v14, v8, 1, v0
+; CHECK-NEXT: vnsrl.wi v10, v12, 0
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vnsrl.wi v10, v12, 8
+; CHECK-NEXT: vmsne.vi v9, v10, 0
 ; CHECK-NEXT: ret
   %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
   ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
@@ -102,12 +102,13 @@ define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv
 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT: vnsrl.wi v28, v8, 0
 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmsne.vi v0, v24, 0
+; CHECK-NEXT: vmsne.vi v7, v24, 0
 ; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
 ; CHECK-NEXT: vnsrl.wi v24, v16, 8
 ; CHECK-NEXT: vnsrl.wi v28, v8, 8
 ; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma
-; CHECK-NEXT: vmsne.vi v8, v24, 0
+; CHECK-NEXT: vmsne.vi v9, v24, 0
+; CHECK-NEXT: vmv1r.v v8, v7
 ; CHECK-NEXT: ret
   %retval = call {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.experimental.vector.deinterleave2.nxv128i1(<vscale x 128 x i1> %vec)
   ret {<vscale x 64 x i1>, <vscale x 64 x i1>} %retval
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
index 1850abe6363bc..e8620c848f8d3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert.mir
@@ -76,6 +76,10 @@
     ret void
   }
 
+  define void @postpass_modify_vl() {
+    ret void
+  }
+
   declare <vscale x 1 x i64> @llvm.riscv.vadd.nxv1i64.nxv1i64.i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i64>, i64) #1
 
   declare <vscale x 1 x i64> @llvm.riscv.vle.nxv1i64.i64(<vscale x 1 x i64>, ptr nocapture, i64) #4
@@ -503,3 +507,25 @@ body: |
     %5:vr = PseudoVMV_V_I_MF2 $noreg, 1, 2, 5, 0
     PseudoRET
 ...
+---
+name: postpass_modify_vl
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $x1
+    ; CHECK-LABEL: name: postpass_modify_vl
+    ; CHECK: liveins: $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $vtype
+    ; CHECK-NEXT: $vl = COPY $x1
+    ; CHECK-NEXT: dead $x0 = PseudoVSETIVLI 3, 216 /* e64, m1, ta, ma */, implicit-def $vl, implicit-def $vtype
+    ; CHECK-NEXT: [[PseudoVADD_VV_M1_:%[0-9]+]]:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6 /* e64 */, 0 /* tu, mu */, implicit $vl, implicit $vtype
+    ; CHECK-NEXT: PseudoRET
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    %1:gpr = COPY $vtype
+    $vl = COPY $x1
+    dead $x0 = PseudoVSETIVLI 3, 216, implicit-def $vl, implicit-def $vtype
+    %4:vr = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, 3, 6, 0
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir index 64e191887e092..a588677bec8e2 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vxrm.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vxrm.mir @@ -11,9 +11,9 @@ body: | ; MIR-LABEL: name: verify_vxrm ; MIR: liveins: $v8, $v9, $x10 ; MIR-NEXT: {{ $}} - ; MIR-NEXT: dead $x0 = PseudoVSETVLI renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype + ; MIR-NEXT: dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype ; MIR-NEXT: WriteVXRMImm 0, implicit-def $vxrm - ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, renamable $v8, renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype, implicit $vxrm + ; MIR-NEXT: renamable $v8 = PseudoVAADD_VV_MF8 undef $v8, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 /* tu, mu */, implicit $vl, implicit $vtype, implicit $vxrm ; MIR-NEXT: PseudoRET implicit $v8 ; ASM-LABEL: verify_vxrm: ; ASM: # %bb.0: @@ -23,8 +23,8 @@ body: | ; ASM-NEXT: ret %0:vr = COPY $v8 %1:vr = COPY $v9 - dead $x0 = PseudoVSETVLI killed renamable $x10, 197 /* e8, mf8, ta, ma */, implicit-def $vl, implicit-def $vtype + %2:gprnox0 = COPY $x10 %pt:vr = IMPLICIT_DEF - renamable $v8 = PseudoVAADD_VV_MF8 %pt, killed renamable $v8, killed renamable $v9, 0, $noreg, 3 /* e8 */, 0 + renamable $v8 = PseudoVAADD_VV_MF8 %pt, %0, %1, 0, %2, 3 /* e8 */, 0 PseudoRET implicit $v8 ... diff --git a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll b/llvm/test/CodeGen/RISCV/strip-w-suffix.ll deleted file mode 100644 index 4124b3d0d360d..0000000000000 --- a/llvm/test/CodeGen/RISCV/strip-w-suffix.ll +++ /dev/null @@ -1,74 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefixes=STRIP %s -; RUN: llc -mtriple=riscv64 -mattr=+m,+no-strip-w-suffix -verify-machineinstrs < %s \ -; RUN: | FileCheck -check-prefixes=NO-STRIP %s - -define i32 @addiw(i32 %a) { -; STRIP-LABEL: addiw: -; STRIP: # %bb.0: -; STRIP-NEXT: lui a1, 1 -; STRIP-NEXT: addi a1, a1, -1 -; STRIP-NEXT: addw a0, a0, a1 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: addiw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: lui a1, 1 -; NO-STRIP-NEXT: addiw a1, a1, -1 -; NO-STRIP-NEXT: addw a0, a0, a1 -; NO-STRIP-NEXT: ret - %ret = add i32 %a, 4095 - ret i32 %ret -} - -define i32 @addw(i32 %a, i32 %b) { -; STRIP-LABEL: addw: -; STRIP: # %bb.0: -; STRIP-NEXT: add a0, a0, a1 -; STRIP-NEXT: addiw a0, a0, 1024 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: addw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: addw a0, a0, a1 -; NO-STRIP-NEXT: addiw a0, a0, 1024 -; NO-STRIP-NEXT: ret - %add = add i32 %a, %b - %ret = add i32 %add, 1024 - ret i32 %ret -} - -define i32 @mulw(i32 %a, i32 %b) { -; STRIP-LABEL: mulw: -; STRIP: # %bb.0: -; STRIP-NEXT: mul a0, a0, a1 -; STRIP-NEXT: addiw a0, a0, 1024 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: mulw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: mulw a0, a0, a1 -; NO-STRIP-NEXT: addiw a0, a0, 1024 -; NO-STRIP-NEXT: ret - %mul = mul i32 %a, %b - %ret = add i32 %mul, 1024 - ret i32 %ret -} - -define i32 @slliw(i32 %a) { -; STRIP-LABEL: slliw: -; STRIP: # %bb.0: -; STRIP-NEXT: slli a0, a0, 1 -; STRIP-NEXT: addiw a0, a0, 1024 -; STRIP-NEXT: ret -; -; NO-STRIP-LABEL: slliw: -; NO-STRIP: # %bb.0: -; NO-STRIP-NEXT: slliw a0, a0, 1 -; NO-STRIP-NEXT: addiw a0, a0, 1024 -; NO-STRIP-NEXT: ret - %shl = shl i32 %a, 1 - %ret = add i32 
%shl, 1024 - ret i32 %ret -} diff --git a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll index 599b0d08629ea..ce0d8fedbfb88 100644 --- a/llvm/test/CodeGen/RISCV/unaligned-load-store.ll +++ b/llvm/test/CodeGen/RISCV/unaligned-load-store.ll @@ -3,9 +3,9 @@ ; RUN: | FileCheck -check-prefixes=ALL,SLOW,RV32I %s ; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ALL,SLOW,RV64I %s -; RUN: llc -mtriple=riscv32 -mattr=+fast-unaligned-access -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ALL,FAST,RV32I-FAST %s -; RUN: llc -mtriple=riscv64 -mattr=+fast-unaligned-access -verify-machineinstrs < %s \ +; RUN: llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefixes=ALL,FAST,RV64I-FAST %s ; A collection of cases showing codegen for unaligned loads and stores diff --git a/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll b/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll index afc75c616f023..3828fe89e60ae 100644 --- a/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/select-phi.ll @@ -1,6 +1,10 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s + ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown --translator-compatibility-mode %s -o - -filetype=obj | spirv-val %} ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --translator-compatibility-mode %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-DAG: %[[Char:.*]] = OpTypeInt 8 0 ; CHECK-DAG: %[[Long:.*]] = OpTypeInt 32 0 diff --git a/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll b/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll new file mode 100644 index 0000000000000..0ff28952f8081 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/instructions/select-ptr-load.ll @@ -0,0 +1,25 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: %[[Float:.*]] = OpTypeFloat 32 +; CHECK-SPIRV-DAG: %[[FloatPtr:.*]] = OpTypePointer Function %[[Float]] +; CHECK-SPIRV: OpInBoundsPtrAccessChain %[[FloatPtr]] +; CHECK-SPIRV: OpInBoundsPtrAccessChain %[[FloatPtr]] +; CHECK-SPIRV: OpSelect %[[FloatPtr]] +; CHECK-SPIRV: OpLoad %[[Float]] + +%struct = type { [3 x float] } + +define spir_kernel void @bar(i1 %sw) { +entry: + %var1 = alloca %struct + %var2 = alloca %struct + %elem1 = getelementptr inbounds [3 x float], ptr %var1, i64 0, i64 0 + %elem2 = getelementptr inbounds [3 x float], ptr %var2, i64 0, i64 1 + %elem = select i1 %sw, ptr %elem1, ptr %elem2 + %res = load float, ptr %elem + ret void +} diff --git a/llvm/test/CodeGen/SPIRV/instructions/select.ll b/llvm/test/CodeGen/SPIRV/instructions/select.ll index c4176b17abb44..9234b97157d9d 100644 --- 
a/llvm/test/CodeGen/SPIRV/instructions/select.ll +++ b/llvm/test/CodeGen/SPIRV/instructions/select.ll @@ -1,6 +1,9 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + ; CHECK-DAG: OpName [[SCALARi32:%.+]] "select_i32" ; CHECK-DAG: OpName [[SCALARPTR:%.+]] "select_ptr" ; CHECK-DAG: OpName [[VEC2i32:%.+]] "select_i32v2" diff --git a/llvm/test/CodeGen/SPIRV/select.ll b/llvm/test/CodeGen/SPIRV/select-builtin.ll similarity index 67% rename from llvm/test/CodeGen/SPIRV/select.ll rename to llvm/test/CodeGen/SPIRV/select-builtin.ll index b34e91be1dbcd..6717970d160fc 100644 --- a/llvm/test/CodeGen/SPIRV/select.ll +++ b/llvm/test/CodeGen/SPIRV/select-builtin.ll @@ -1,4 +1,6 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: OpSelect diff --git a/llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll b/llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll new file mode 100644 index 0000000000000..ea0197548a815 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/memcpy-zext.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-32 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-64 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-64-DAG: %[[#i64:]] = OpTypeInt 64 0 + +; CHECK-DAG: %[[#i8:]] = OpTypeInt 8 0 +; CHECK-DAG: %[[#i32:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 +; CHECK-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 +; CHECK-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 +; CHECK-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] +; CHECK-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] +; CHECK-DAG: %[[#szconst1024:]] = OpConstant %[[#i32]] 1024 +; CHECK-DAG: %[[#szconst42:]] = OpConstant %[[#i8]] 42 +; CHECK-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] +; CHECK-DAG: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] +; CHECK-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] +; CHECK: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function + +; CHECK-32: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconst1024]] +; CHECK-64: %[[#szconstext1024:]] = OpUConvert %[[#i64:]] %[[#szconst1024:]] +; CHECK-64: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconstext1024]] + +; CHECK-32: %[[#szconstext42:]] = OpUConvert %[[#i32:]] %[[#szconst42:]] +; CHECK-32: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconstext42]] +; CHECK-64: %[[#szconstext42:]] = OpUConvert %[[#i64:]] %[[#szconst42:]] +; CHECK-64: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#szconstext42]] + +@__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3] + +define spir_func void @test() { +entry: + %arr = alloca 
[3 x i32], align 4 + %dest = bitcast ptr %arr to ptr + call void @llvm.memcpy.p0.p2.i32(ptr align 4 %dest, ptr addrspace(2) align 4 @__const.test.arr, i32 1024, i1 false) + call void @llvm.memcpy.p0.p2.i8(ptr align 4 %dest, ptr addrspace(2) align 4 @__const.test.arr, i8 42, i1 false) + ret void +} + +declare void @llvm.memcpy.p0.p2.i32(ptr nocapture writeonly, ptr addrspace(2) nocapture readonly, i32, i1) +declare void @llvm.memcpy.p0.p2.i8(ptr nocapture writeonly, ptr addrspace(2) nocapture readonly, i8, i1) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll index e0172ec3c1bdb..04fb39118034c 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/spirv-private-array-initialization.ll @@ -1,23 +1,34 @@ -; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV -; -; CHECK-SPIRV-DAG: %[[#i32:]] = OpTypeInt 32 0 -; CHECK-SPIRV-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 -; CHECK-SPIRV-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 -; CHECK-SPIRV-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 -; CHECK-SPIRV-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] -; CHECK-SPIRV-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] -; CHECK-SPIRV-DAG: %[[#twelve:]] = OpConstant %[[#i32]] 12 -; CHECK-SPIRV-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] - -; CHECK-SPIRV: %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] -; CHECK-SPIRV: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] - -; CHECK-SPIRV-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] - -; CHECK-SPIRV: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function -; CHECK-SPIRV: %[[#arr2:]] = OpVariable %[[#i32x3_ptr]] Function -; CHECK-SPIRV: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4 -; CHECK-SPIRV: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4 +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-32 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-64 +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-64-DAG: %[[#i64:]] = OpTypeInt 64 0 + +; CHECK-SPIRV-DAG: %[[#i32:]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[#one:]] = OpConstant %[[#i32]] 1 +; CHECK-SPIRV-DAG: %[[#two:]] = OpConstant %[[#i32]] 2 +; CHECK-SPIRV-DAG: %[[#three:]] = OpConstant %[[#i32]] 3 +; CHECK-SPIRV-DAG: %[[#i32x3:]] = OpTypeArray %[[#i32]] %[[#three]] +; CHECK-SPIRV-DAG: %[[#test_arr_init:]] = OpConstantComposite %[[#i32x3]] %[[#one]] %[[#two]] %[[#three]] +; CHECK-SPIRV-DAG: %[[#twelve:]] = OpConstant %[[#i32]] 12 +; CHECK-SPIRV-DAG: %[[#const_i32x3_ptr:]] = OpTypePointer UniformConstant %[[#i32x3]] + +; CHECK-SPIRV: %[[#test_arr2:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] +; CHECK-SPIRV: %[[#test_arr:]] = OpVariable %[[#const_i32x3_ptr]] UniformConstant %[[#test_arr_init]] + +; CHECK-SPIRV-DAG: %[[#i32x3_ptr:]] = OpTypePointer Function %[[#i32x3]] + +; CHECK-SPIRV: %[[#arr:]] = OpVariable %[[#i32x3_ptr]] Function +; CHECK-SPIRV: %[[#arr2:]] = 
OpVariable %[[#i32x3_ptr]] Function + +; CHECK-SPIRV-32: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelve]] Aligned 4 +; CHECK-SPIRV-32: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelve]] Aligned 4 + +; CHECK-SPIRV-64: %[[#twelvezext1:]] = OpUConvert %[[#i64:]] %[[#twelve:]] +; CHECK-SPIRV-64: OpCopyMemorySized %[[#arr]] %[[#test_arr]] %[[#twelvezext1]] Aligned 4 +; CHECK-SPIRV-64: %[[#twelvezext2:]] = OpUConvert %[[#i64:]] %[[#twelve:]] +; CHECK-SPIRV-64: OpCopyMemorySized %[[#arr2]] %[[#test_arr2]] %[[#twelvezext2]] Aligned 4 @__const.test.arr = private unnamed_addr addrspace(2) constant [3 x i32] [i32 1, i32 2, i32 3], align 4 diff --git a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll index a9b2037e9947a..8d2ee3c50f215 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll @@ -10,27 +10,22 @@ define void @test_float(ptr %a , float %b) { ; CHECK64_SMALL: # %bb.0: # %entry ; CHECK64_SMALL-NEXT: movss {{.*#+}} xmm1 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK64_SMALL-NEXT: addss %xmm0, %xmm1 -; CHECK64_SMALL-NEXT: movd %xmm1, %eax -; CHECK64_SMALL-NEXT: movl %eax, (%rdi) +; CHECK64_SMALL-NEXT: movss %xmm1, (%rdi) ; CHECK64_SMALL-NEXT: retq ; ; CHECK64_LARGE-LABEL: test_float: ; CHECK64_LARGE: # %bb.0: # %entry ; CHECK64_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax ; CHECK64_LARGE-NEXT: addss (%rax), %xmm0 -; CHECK64_LARGE-NEXT: movd %xmm0, %eax -; CHECK64_LARGE-NEXT: movl %eax, (%rdi) +; CHECK64_LARGE-NEXT: movss %xmm0, (%rdi) ; CHECK64_LARGE-NEXT: retq ; ; CHECK32-LABEL: test_float: ; CHECK32: # %bb.0: # %entry ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK32-NEXT: movss {{.*#+}} xmm0 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0] -; CHECK32-NEXT: movd %ecx, %xmm1 -; CHECK32-NEXT: addss %xmm0, %xmm1 -; CHECK32-NEXT: movd %xmm1, %ecx -; CHECK32-NEXT: movl %ecx, (%eax) +; CHECK32-NEXT: addss {{[0-9]+}}(%esp), %xmm0 +; CHECK32-NEXT: movss %xmm0, (%eax) ; CHECK32-NEXT: retl entry: %aa = fadd float 5.500000e+00, %b diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll new file mode 100644 index 0000000000000..3388af605d969 --- /dev/null +++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-sse-intrinsics.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=i686-- -mattr=+sse -global-isel -stop-after=regbankselect | FileCheck %s + +define void @test_x86_sse_max_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_max_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.max.ps), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) 
into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_max_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_max_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.max.ss), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_min_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_min_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.min.ps), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_min_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_min_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) + ; CHECK-NEXT: 
[[LOAD2:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD1]](p0) :: (load (<4 x s32>) from %ir.p2) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.min.ss), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %a1 = load <4 x float>, ptr %p2, align 16 + %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone + + +define void @test_x86_sse_rcp_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rcp_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rcp.ps), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone + + +define void @test_x86_sse_rcp_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rcp_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rcp.ss), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone + + +define void @test_x86_sse_rsqrt_ps(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rsqrt_ps + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rsqrt.ps), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone + + +define 
void @test_x86_sse_rsqrt_ss(ptr %p1, ptr %p2) { + ; CHECK-LABEL: name: test_x86_sse_rsqrt_ss + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (p0) from %fixed-stack.1) + ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:vecr(<4 x s32>) = G_LOAD [[LOAD]](p0) :: (load (<4 x s32>) from %ir.p1) + ; CHECK-NEXT: [[INT:%[0-9]+]]:vecr(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.x86.sse.rsqrt.ss), [[LOAD1]](<4 x s32>) + ; CHECK-NEXT: G_STORE [[INT]](<4 x s32>), [[LOAD]](p0) :: (store (<4 x s32>) into %ir.p1) + ; CHECK-NEXT: RET 0 + %a0 = load <4 x float>, ptr %p1, align 16 + %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1] + store <4 x float> %res, ptr %p1 + ret void +} +declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone diff --git a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll index d09db0f2474c9..99d458a183a9b 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/regbankselect-x87.ll @@ -142,7 +142,7 @@ define float @f4(float %val) { ; X86-LABEL: name: f4 ; X86: bb.1 (%ir-block.0): ; X86-NEXT: [[FRAME_INDEX:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 - ; X86-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (s32) from %fixed-stack.0) + ; X86-NEXT: [[LOAD:%[0-9]+]]:psr(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (invariant load (s32) from %fixed-stack.0) ; X86-NEXT: $fp0 = COPY [[LOAD]](s32) ; X86-NEXT: RET 0, implicit $fp0 ; @@ -187,13 +187,10 @@ define void @f5(ptr %a, ptr %b) { ; X64-NEXT: {{ $}} ; X64-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY $rdi ; X64-NEXT: [[COPY1:%[0-9]+]]:gpr(p0) = COPY $rsi - ; X64-NEXT: [[LOAD:%[0-9]+]]:gpr(s64) = G_LOAD [[COPY]](p0) :: (load (s64) from %ir.a) - ; X64-NEXT: [[LOAD1:%[0-9]+]]:gpr(s64) = G_LOAD [[COPY1]](p0) :: (load (s64) from %ir.b) - ; X64-NEXT: [[COPY2:%[0-9]+]]:psr(s64) = COPY [[LOAD]](s64) - ; X64-NEXT: [[COPY3:%[0-9]+]]:psr(s64) = COPY [[LOAD1]](s64) - ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s64) = G_FADD [[COPY2]], [[COPY3]] - ; X64-NEXT: [[COPY4:%[0-9]+]]:gpr(s64) = COPY [[FADD]](s64) - ; X64-NEXT: G_STORE [[COPY4]](s64), [[COPY]](p0) :: (store (s64) into %ir.a) + ; X64-NEXT: [[LOAD:%[0-9]+]]:psr(s64) = G_LOAD [[COPY]](p0) :: (load (s64) from %ir.a) + ; X64-NEXT: [[LOAD1:%[0-9]+]]:psr(s64) = G_LOAD [[COPY1]](p0) :: (load (s64) from %ir.b) + ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s64) = G_FADD [[LOAD]], [[LOAD1]] + ; X64-NEXT: G_STORE [[FADD]](s64), [[COPY]](p0) :: (store (s64) into %ir.a) ; X64-NEXT: RET 0 %load1 = load double, ptr %a, align 8 %load2 = load double, ptr %b, align 8 @@ -210,11 +207,9 @@ define void @f6(ptr %0, ptr %1) { ; X86-NEXT: [[FRAME_INDEX1:%[0-9]+]]:gpr(p0) = G_FRAME_INDEX %fixed-stack.0 ; X86-NEXT: [[LOAD1:%[0-9]+]]:gpr(p0) = G_LOAD [[FRAME_INDEX1]](p0) :: (invariant load (p0) from %fixed-stack.0) ; X86-NEXT: [[C:%[0-9]+]]:psr(s32) = G_FCONSTANT float 2.000000e+01 - ; X86-NEXT: [[LOAD2:%[0-9]+]]:gpr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.0) - ; X86-NEXT: [[COPY:%[0-9]+]]:psr(s32) = COPY [[LOAD2]](s32) - ; X86-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[COPY]], [[C]] - ; X86-NEXT: [[COPY1:%[0-9]+]]:gpr(s32) = COPY [[FADD]](s32) - ; X86-NEXT: G_STORE [[COPY1]](s32), [[LOAD1]](p0) :: (store (s32) into %ir.1) + ; X86-NEXT: [[LOAD2:%[0-9]+]]:psr(s32) = G_LOAD [[LOAD]](p0) :: (load (s32) from %ir.0) + ; 
X86-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[LOAD2]], [[C]] + ; X86-NEXT: G_STORE [[FADD]](s32), [[LOAD1]](p0) :: (store (s32) into %ir.1) ; X86-NEXT: RET 0 ; ; X64-LABEL: name: f6 @@ -224,11 +219,9 @@ define void @f6(ptr %0, ptr %1) { ; X64-NEXT: [[COPY:%[0-9]+]]:gpr(p0) = COPY $rdi ; X64-NEXT: [[COPY1:%[0-9]+]]:gpr(p0) = COPY $rsi ; X64-NEXT: [[C:%[0-9]+]]:psr(s32) = G_FCONSTANT float 2.000000e+01 - ; X64-NEXT: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.0) - ; X64-NEXT: [[COPY2:%[0-9]+]]:psr(s32) = COPY [[LOAD]](s32) - ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[COPY2]], [[C]] - ; X64-NEXT: [[COPY3:%[0-9]+]]:gpr(s32) = COPY [[FADD]](s32) - ; X64-NEXT: G_STORE [[COPY3]](s32), [[COPY1]](p0) :: (store (s32) into %ir.1) + ; X64-NEXT: [[LOAD:%[0-9]+]]:psr(s32) = G_LOAD [[COPY]](p0) :: (load (s32) from %ir.0) + ; X64-NEXT: [[FADD:%[0-9]+]]:psr(s32) = G_FADD [[LOAD]], [[C]] + ; X64-NEXT: G_STORE [[FADD]](s32), [[COPY1]](p0) :: (store (s32) into %ir.1) ; X64-NEXT: RET 0 %load1 = load float, ptr %0 %add = fadd float %load1, 20.0 diff --git a/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll b/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll index 016c9a4d7b839..66a6fd3767542 100644 --- a/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll +++ b/llvm/test/CodeGen/X86/code-model-elf-text-sections.ll @@ -13,9 +13,20 @@ ; RUN: llvm-readelf -S %t | FileCheck %s --check-prefix=LARGE-DS ; SMALL: .text {{.*}} AX {{.*}} +; SMALL: .ltext {{.*}} AXl {{.*}} +; SMALL: .ltext.2 {{.*}} AXl {{.*}} +; SMALL: .foo {{.*}} AX {{.*}} ; SMALL-DS: .text.func {{.*}} AX {{.*}} +; SMALL-DS: .ltext {{.*}} AXl {{.*}} +; SMALL-DS: .ltext.2 {{.*}} AXl {{.*}} +; SMALL-DS: .foo {{.*}} AX {{.*}} ; LARGE: .ltext {{.*}} AXl {{.*}} +; LARGE: .ltext.2 {{.*}} AXl {{.*}} +; LARGE: .foo {{.*}} AX {{.*}} ; LARGE-DS: .ltext.func {{.*}} AXl {{.*}} +; LARGE-DS: .ltext {{.*}} AXl {{.*}} +; LARGE-DS: .ltext.2 {{.*}} AXl {{.*}} +; LARGE-DS: .foo {{.*}} AX {{.*}} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64--linux" @@ -23,3 +34,15 @@ target triple = "x86_64--linux" define void @func() { ret void } + +define void @ltext() section ".ltext" { + ret void +} + +define void @ltext2() section ".ltext.2" { + ret void +} + +define void @foo() section ".foo" { + ret void +} diff --git a/llvm/test/CodeGen/X86/code-model-elf.ll b/llvm/test/CodeGen/X86/code-model-elf.ll index 0da62e3e7a651..f60f75bc26911 100644 --- a/llvm/test/CodeGen/X86/code-model-elf.ll +++ b/llvm/test/CodeGen/X86/code-model-elf.ll @@ -350,7 +350,7 @@ define dso_local ptr @lea_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: lea_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data, %rax ; LARGE-STATIC-NEXT: retq ; ; SMALL-PIC-LABEL: lea_forced_small_data: @@ -403,7 +403,7 @@ define dso_local i32 @load_forced_small_data() #0 { ; ; LARGE-STATIC-LABEL: load_forced_small_data: ; LARGE-STATIC: # %bb.0: -; LARGE-STATIC-NEXT: movl $forced_small_data+8, %eax +; LARGE-STATIC-NEXT: movabsq $forced_small_data+8, %rax ; LARGE-STATIC-NEXT: movl (%rax), %eax ; LARGE-STATIC-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll index 337edef96beee..3a695bfc6234d 100644 --- a/llvm/test/CodeGen/X86/combine-ptest.ll +++ b/llvm/test/CodeGen/X86/combine-ptest.ll @@ -397,6 +397,48 @@ define i1 @PR38788(<4 x i32> %0, <4 x i32> %1) { ret i1 %7 } +define i32 @PR88958_1(ptr %0, <2 x 
i64> %1) { +; SSE-LABEL: PR88958_1: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ptest %xmm0, %xmm1 +; SSE-NEXT: sete %al +; SSE-NEXT: retq +; +; AVX-LABEL: PR88958_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm1 +; AVX-NEXT: sete %al +; AVX-NEXT: retq + %3 = load <2 x i64>, ptr %0 + %4 = tail call i32 @llvm.x86.sse41.ptestz(<2 x i64> %3, <2 x i64> %1) + ret i32 %4 +} + +define i32 @PR88958_2(ptr %0, <2 x i64> %1) { +; SSE-LABEL: PR88958_2: +; SSE: # %bb.0: +; SSE-NEXT: movdqa (%rdi), %xmm1 +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: ptest %xmm0, %xmm1 +; SSE-NEXT: setb %al +; SSE-NEXT: retq +; +; AVX-LABEL: PR88958_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: xorl %eax, %eax +; AVX-NEXT: vptest %xmm0, %xmm1 +; AVX-NEXT: setb %al +; AVX-NEXT: retq + %3 = load <2 x i64>, ptr %0 + %4 = tail call i32 @llvm.x86.sse41.ptestc(<2 x i64> %3, <2 x i64> %1) + ret i32 %4 +} + declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir new file mode 100644 index 0000000000000..67f8cc72e0d72 --- /dev/null +++ b/llvm/test/CodeGen/X86/tail-dup-pred-succ-size.mir @@ -0,0 +1,260 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=3 -tail-dup-succ-size=3 %s -o - | FileCheck %s -check-prefix=LIMIT +# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=4 -tail-dup-succ-size=4 %s -o - | FileCheck %s -check-prefix=NOLIMIT + +--- +name: foo +tracksRegLiveness: true +jumpTable: + kind: block-address + entries: + - id: 0 + blocks: [ '%bb.2', '%bb.3', '%bb.4', '%bb.5' ] + - id: 1 + blocks: [ '%bb.9', '%bb.10', '%bb.11', '%bb.12' ] +body: | + ; LIMIT-LABEL: name: foo + ; LIMIT: bb.0: + ; LIMIT-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; LIMIT-NEXT: liveins: $rdi, $esi + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esi + ; LIMIT-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; LIMIT-NEXT: [[SHR32ri:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri:%[0-9]+]]:gr32 = AND32ri [[SHR32ri]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG]], %jump-table.0, $noreg + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.2: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.3: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.4: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, 
$noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.5: + ; LIMIT-NEXT: successors: %bb.7(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.7 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.6: + ; LIMIT-NEXT: successors: + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.7: + ; LIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[PHI:%[0-9]+]]:gr32 = PHI [[SHR32ri3]], %bb.5, [[SHR32ri2]], %bb.4, [[SHR32ri1]], %bb.3, [[MOV32rm]], %bb.2 + ; LIMIT-NEXT: [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri4]], 7, implicit-def dead $eflags + ; LIMIT-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri1]], %subreg.sub_32bit + ; LIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.9: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: JMP_1 %bb.13 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.10: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.13 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.11: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags + ; LIMIT-NEXT: JMP_1 %bb.13 + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.12: + ; LIMIT-NEXT: successors: %bb.13(0x80000000) + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; LIMIT-NEXT: [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags + ; LIMIT-NEXT: {{ $}} + ; LIMIT-NEXT: bb.13: + ; LIMIT-NEXT: [[PHI1:%[0-9]+]]:gr32 = PHI [[SHR32ri7]], %bb.12, [[SHR32ri6]], %bb.11, [[SHR32ri5]], %bb.10, [[MOV32rm4]], %bb.9 + ; LIMIT-NEXT: [[OR32rr:%[0-9]+]]:gr32 = OR32rr [[PHI1]], [[PHI]], implicit-def dead $eflags + ; LIMIT-NEXT: $eax = COPY [[OR32rr]] + ; LIMIT-NEXT: RET 0, $eax + ; + ; NOLIMIT-LABEL: name: foo + ; NOLIMIT: bb.0: + ; NOLIMIT-NEXT: successors: %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000), %bb.5(0x20000000) + ; NOLIMIT-NEXT: liveins: $rdi, $esi + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $esi + ; NOLIMIT-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; NOLIMIT-NEXT: [[SHR32ri:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 1, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri:%[0-9]+]]:gr32 = AND32ri [[SHR32ri]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, killed [[AND32ri]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG]], %jump-table.0, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.2: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), 
%bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri1:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri1:%[0-9]+]]:gr32 = AND32ri [[SHR32ri1]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri1]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG1]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.3: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm1:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri2:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm1]], 1, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SHR32ri3:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri2:%[0-9]+]]:gr32 = AND32ri [[SHR32ri3]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri2]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG2]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.4: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm2:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri4:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm2]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SHR32ri5:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri3:%[0-9]+]]:gr32 = AND32ri [[SHR32ri5]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri3]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG3]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.5: + ; NOLIMIT-NEXT: successors: %bb.9(0x20000000), %bb.10(0x20000000), %bb.11(0x20000000), %bb.12(0x20000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[MOV32rm3:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri6:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm3]], 3, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SHR32ri7:%[0-9]+]]:gr32 = SHR32ri [[COPY]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[AND32ri4:%[0-9]+]]:gr32 = AND32ri [[SHR32ri7]], 7, implicit-def dead $eflags + ; NOLIMIT-NEXT: [[SUBREG_TO_REG4:%[0-9]+]]:gr64_nosp = SUBREG_TO_REG 0, [[AND32ri4]], %subreg.sub_32bit + ; NOLIMIT-NEXT: JMP64m $noreg, 8, [[SUBREG_TO_REG4]], %jump-table.1, $noreg + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.6: + ; NOLIMIT-NEXT: successors: + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.9: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: JMP_1 %bb.13 + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.10: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI1:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri8:%[0-9]+]]:gr32 = 
SHR32ri [[MOV32rm5]], 1, implicit-def dead $eflags + ; NOLIMIT-NEXT: JMP_1 %bb.13 + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.11: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI2:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri9:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm6]], 2, implicit-def dead $eflags + ; NOLIMIT-NEXT: JMP_1 %bb.13 + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.12: + ; NOLIMIT-NEXT: successors: %bb.13(0x80000000) + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: [[PHI3:%[0-9]+]]:gr32 = PHI [[MOV32rm]], %bb.2, [[SHR32ri2]], %bb.3, [[SHR32ri4]], %bb.4, [[SHR32ri6]], %bb.5 + ; NOLIMIT-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 0, $noreg + ; NOLIMIT-NEXT: [[SHR32ri10:%[0-9]+]]:gr32 = SHR32ri [[MOV32rm7]], 6, implicit-def dead $eflags + ; NOLIMIT-NEXT: {{ $}} + ; NOLIMIT-NEXT: bb.13: + ; NOLIMIT-NEXT: [[PHI4:%[0-9]+]]:gr32 = PHI [[PHI]], %bb.9, [[PHI1]], %bb.10, [[PHI2]], %bb.11, [[PHI3]], %bb.12 + ; NOLIMIT-NEXT: [[PHI5:%[0-9]+]]:gr32 = PHI [[SHR32ri10]], %bb.12, [[SHR32ri9]], %bb.11, [[SHR32ri8]], %bb.10, [[MOV32rm4]], %bb.9 + ; NOLIMIT-NEXT: [[OR32rr:%[0-9]+]]:gr32 = OR32rr [[PHI5]], [[PHI4]], implicit-def dead $eflags + ; NOLIMIT-NEXT: $eax = COPY [[OR32rr]] + ; NOLIMIT-NEXT: RET 0, $eax + bb.0: + liveins: $rdi, $esi + + %11:gr32 = COPY $esi + %10:gr64 = COPY $rdi + %13:gr32 = SHR32ri %11, 1, implicit-def dead $eflags + %14:gr32 = AND32ri %13, 7, implicit-def dead $eflags + %12:gr64_nosp = SUBREG_TO_REG 0, killed %14, %subreg.sub_32bit + + bb.1: + successors: %bb.2, %bb.3, %bb.4, %bb.5 + + JMP64m $noreg, 8, %12, %jump-table.0, $noreg + + bb.2: + %0:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + JMP_1 %bb.7 + + bb.3: + %17:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %1:gr32 = SHR32ri %17, 1, implicit-def dead $eflags + JMP_1 %bb.7 + + bb.4: + %16:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %2:gr32 = SHR32ri %16, 2, implicit-def dead $eflags + JMP_1 %bb.7 + + bb.5: + %15:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %3:gr32 = SHR32ri %15, 3, implicit-def dead $eflags + JMP_1 %bb.7 + + bb.6: + successors: + + bb.7: + %4:gr32 = PHI %3, %bb.5, %2, %bb.4, %1, %bb.3, %0, %bb.2 + %19:gr32 = SHR32ri %11, 2, implicit-def dead $eflags + %20:gr32 = AND32ri %19, 7, implicit-def dead $eflags + %18:gr64_nosp = SUBREG_TO_REG 0, killed %20, %subreg.sub_32bit + + bb.8: + successors: %bb.9, %bb.10, %bb.11, %bb.12 + + JMP64m $noreg, 8, %18, %jump-table.1, $noreg + + bb.9: + %5:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + JMP_1 %bb.13 + + bb.10: + %23:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %6:gr32 = SHR32ri %23, 1, implicit-def dead $eflags + JMP_1 %bb.13 + + bb.11: + %22:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %7:gr32 = SHR32ri %22, 2, implicit-def dead $eflags + JMP_1 %bb.13 + + bb.12: + %21:gr32 = MOV32rm %10, 1, $noreg, 0, $noreg + %8:gr32 = SHR32ri %21, 6, implicit-def dead $eflags + + bb.13: + %9:gr32 = PHI %8, %bb.12, %7, %bb.11, %6, %bb.10, %5, %bb.9 + %24:gr32 = OR32rr %9, %4, implicit-def dead $eflags + $eax = COPY %24 + RET 0, $eax + +... 
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 0c76c14afb0ae..4859a8e0eaaa5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -305,6 +305,37 @@ define <4 x float> @combine_vpermilvar_4f32_as_insertps(<4 x float> %a0) { ret <4 x float> %2 } +define <8 x i32> @combine_blend_of_permutes_v8i32(<4 x i64> %a0, <4 x i64> %a1) { +; AVX1-LABEL: combine_blend_of_permutes_v8i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX1-NEXT: ret{{[l|q]}} +; +; AVX2-LABEL: combine_blend_of_permutes_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] +; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4],ymm0[5,6],ymm1[7] +; AVX2-NEXT: ret{{[l|q]}} +; +; AVX512-LABEL: combine_blend_of_permutes_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [4,21,6,23,16,1,2,19] +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512-NEXT: ret{{[l|q]}} + %s0 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> + %s1 = shufflevector <4 x i64> %a1, <4 x i64> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1> + %x0 = bitcast <4 x i64> %s0 to <8 x i32> + %x1 = bitcast <4 x i64> %s1 to <8 x i32> + %r = shufflevector <8 x i32> %x0, <8 x i32> %x1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 12, i32 5, i32 6, i32 15> + ret <8 x i32> %r +} + define <2 x double> @constant_fold_vpermilvar_pd() { ; CHECK-LABEL: constant_fold_vpermilvar_pd: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index f53b1eeaf8f54..e87e810971e11 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -973,3 +973,47 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) { %2 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer) ret <8 x i64> %2 } + +define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8 x i64> %a1) { +; X86-AVX512F-LABEL: blend_of_permutes_v16i32: +; X86-AVX512F: # %bb.0: +; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] +; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] +; X86-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A +; X86-AVX512F-NEXT: kmovw %eax, %k1 +; X86-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; X86-AVX512F-NEXT: retl +; +; X86-AVX512BW-LABEL: blend_of_permutes_v16i32: +; X86-AVX512BW: # %bb.0: +; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] +; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] +; X86-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A +; X86-AVX512BW-NEXT: kmovd %eax, %k1 +; X86-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; X86-AVX512BW-NEXT: retl +; +; X64-AVX512F-LABEL: blend_of_permutes_v16i32: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] +; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] +; X64-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A +; X64-AVX512F-NEXT: kmovw %eax, %k1 +; X64-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; X64-AVX512F-NEXT: retq +; +; X64-AVX512BW-LABEL: blend_of_permutes_v16i32: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] +; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] +; X64-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A +; X64-AVX512BW-NEXT: kmovd %eax, %k1 +; X64-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; X64-AVX512BW-NEXT: retq + %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> + %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5> + %x0 = bitcast <8 x i64> %s0 to <16 x i32> + %x1 = bitcast <8 x i64> %s1 to <16 x i32> + %r = shufflevector <16 x i32> %x0, <16 x i32> %x1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 20, i32 5, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 28, i32 13, i32 14, i32 31> + ret <16 x i32> %r +} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index 5eb017bc80ca5..8d213d2577433 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 ; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX) @@ -22,6 +22,45 @@ define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) { ret <16 x i8> %res0 } +define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) { +; SSE-LABEL: combine_blend_of_permutes_v4i32: +; SSE: # %bb.0: +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] +; SSE-NEXT: retq +; +; AVX1-LABEL: combine_blend_of_permutes_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_blend_of_permutes_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,0,1] +; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] +; AVX2-NEXT: retq +; +; AVX512-LABEL: combine_blend_of_permutes_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,19,0,17] +; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 0> + %s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> <i32 1, i32 0> + %x0 = bitcast <2 x i64> %s0 to <4 x i32> + %x1 = bitcast <2 x i64> %s1 to <4 x i32> + %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 0, i32 5, i32 2, i32 7> +
ret <4 x i32> %r +} + define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-LABEL: PR50049: ; SSE: # %bb.0: @@ -56,6 +95,107 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: retq +; +; AVX1-LABEL: PR50049: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa (%rsi), %xmm2 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX1-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR50049: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] +; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa (%rsi), %xmm2 +; AVX2-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX2-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR50049: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] +; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa (%rsi), %xmm2 +; AVX512-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3 +; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4] +; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128] +; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero +; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512-NEXT: vpmovdb %zmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq %x1 = load <48 x i8>, ptr %p1, align 16 %x2 = load <48 x i8>, ptr %p2, align 16 %s1 = shufflevector <48 x i8> %x1, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> diff --git a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll index 8d7608ead38ed..de367dfa4acb4 100644 --- a/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll +++ b/llvm/test/DebugInfo/NVPTX/dbg-declare-alloca.ll @@ -9,7 +9,6 @@ ; CHECK: add.u64 %rd1, %SP, 0; ; CHECK: .loc 1 5 3 // t.c:5:3 ; CHECK: { // callseq 0, 0 -; CHECK: .reg .b32 temp_param_reg; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0+0], %rd1; ; CHECK: call.uni diff --git a/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll b/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll index 4990979f10c53..55e436b1a93b2 100644 --- a/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll +++ b/llvm/test/DebugInfo/debugify-bogus-dbg-value.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=check-debugify < %s 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -passes=check-debugify <
%s 2>&1 | FileCheck %s define <2 x i64> @test-fun(<2 x i64> %A) !dbg !6 { %and = and <2 x i64> %A, , !dbg !14 diff --git a/llvm/test/DebugInfo/debugify-each.ll b/llvm/test/DebugInfo/debugify-each.ll index e9241dedb6960..7685b57b5dd15 100644 --- a/llvm/test/DebugInfo/debugify-each.ll +++ b/llvm/test/DebugInfo/debugify-each.ll @@ -40,6 +40,40 @@ ; RUN: opt -debugify-each -passes=globalopt -S -o /dev/null < %s 2> %t ; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS-ONE +; Repeat the same checks with debug intrinsics enabled. +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -O3 -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS +; RUN: opt --experimental-debuginfo-iterators=false -disable-output -debugify-each -passes='default' %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -debugify-each -O3 -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -passes='instrprof,instrprof,sroa,sccp' -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -O1 < %s | opt -O2 -o /dev/null + +; RUN: opt --experimental-debuginfo-iterators=false -disable-output -debugify-quiet -debugify-each -O1 < %s 2>&1 | count 0 + +; RUN: opt --experimental-debuginfo-iterators=false -O1 < %s -S -o %t.before +; RUN: opt --experimental-debuginfo-iterators=false -O1 -debugify-each < %s -S -o %t.after +; RUN: diff %t.before %t.after + +; RUN: opt --experimental-debuginfo-iterators=false -O1 < %s | llvm-dis -o %t.before +; RUN: opt --experimental-debuginfo-iterators=false -O1 -debugify-each < %s | llvm-dis -o %t.after +; RUN: diff %t.before %t.after + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -passes=instsimplify -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=FUNCTION-PASS-ONE + +; RUN: opt --experimental-debuginfo-iterators=false -debugify-each -passes=globalopt -S -o /dev/null < %s 2> %t +; RUN: FileCheck %s -input-file=%t -check-prefix=MODULE-PASS-ONE + define void @foo(i32 %arg) { call i32 asm "bswap $0", "=r,r"(i32 %arg) ret void diff --git a/llvm/test/DebugInfo/debugify-export.ll b/llvm/test/DebugInfo/debugify-export.ll index 6e5952d433da9..30333ca908b0d 100644 --- a/llvm/test/DebugInfo/debugify-export.ll +++ b/llvm/test/DebugInfo/debugify-export.ll @@ -1,6 +1,9 @@ ; RUN: opt %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s ; RUN: opt %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false %s -disable-output -debugify-each -debugify-quiet -debugify-export - -passes=globalopt | FileCheck %s + ; CHECK: Pass Name ; CHECK-SAME: # of missing debug values ; CHECK-SAME: # of missing locations diff --git a/llvm/test/DebugInfo/debugify-ignore-phi.ll 
b/llvm/test/DebugInfo/debugify-ignore-phi.ll index 322ccafa22ac8..643df1d960485 100644 --- a/llvm/test/DebugInfo/debugify-ignore-phi.ll +++ b/llvm/test/DebugInfo/debugify-ignore-phi.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=check-debugify < %s -S 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -passes=check-debugify < %s -S 2>&1 | FileCheck %s define void @test_phi(i1 %cond) !dbg !6 { br i1 %cond, label %1, label %2, !dbg !11 diff --git a/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll b/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll index 941b294fb8556..4cbbfc5c215e2 100644 --- a/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll +++ b/llvm/test/DebugInfo/debugify-original-no-dbg-info.ll @@ -1,4 +1,5 @@ ; RUN: opt -verify-debuginfo-preserve -passes=instcombine -S -o - < %s 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -verify-debuginfo-preserve -passes=instcombine -S -o - < %s 2>&1 | FileCheck %s ; CHECK: ModuleDebugify (original debuginfo): Skipping module without debug info ; CHECK-NEXT: CheckModuleDebugify (original debuginfo): Skipping module without debug info diff --git a/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll b/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll index 1c5daa19c6484..04b7636f025a0 100644 --- a/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll +++ b/llvm/test/DebugInfo/debugify-report-missing-locs-only.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=check-debugify < %s -S -o - 2>&1 | FileCheck %s -implicit-check-not "WARNING: Instruction with empty DebugLoc in function bar" +; RUN: opt --experimental-debuginfo-iterators=false -passes=check-debugify < %s -S -o - 2>&1 | FileCheck %s -implicit-check-not "WARNING: Instruction with empty DebugLoc in function bar" ; CHECK: WARNING: Instruction with empty DebugLoc in function foo -- ret void define void @foo() !dbg !6 { diff --git a/llvm/test/DebugInfo/debugify.ll b/llvm/test/DebugInfo/debugify.ll index 5ce6795d41b6b..191015f825933 100644 --- a/llvm/test/DebugInfo/debugify.ll +++ b/llvm/test/DebugInfo/debugify.ll @@ -25,6 +25,33 @@ ; RUN: opt -enable-debugify -O1 < %s | opt -O2 -o /dev/null ; RUN: opt -passes=debugify,mem2reg,check-debugify < %s | opt -O2 -o /dev/null +;; Perform the same checks again for intrinsic debug info +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify -S -o - < %s | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify -S -o - < %s | FileCheck %s + +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,debugify -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-REPEAT +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,debugify -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-REPEAT + +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,check-debugify -S -o - < %s | \ +; RUN: FileCheck %s -implicit-check-not="CheckModuleDebugify: FAIL" +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,check-debugify -S -o - < %s | \ +; RUN: FileCheck %s -implicit-check-not="CheckModuleDebugify: FAIL" +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -passes=verify -S -o - < %s | \ +; RUN: FileCheck %s -implicit-check-not="CheckModuleDebugify: FAIL" + +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,strip,check-debugify -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-WARN + +; RUN: opt --experimental-debuginfo-iterators=false 
-enable-debugify -passes=strip -S -o - < %s 2>&1 | \ +; RUN: FileCheck %s -check-prefix=CHECK-WARN + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -S -o - < %s 2>&1 | FileCheck %s -check-prefix=PASS + +; RUN: opt --experimental-debuginfo-iterators=false -enable-debugify -O1 < %s | opt -O2 -o /dev/null +; RUN: opt --experimental-debuginfo-iterators=false -passes=debugify,mem2reg,check-debugify < %s | opt -O2 -o /dev/null + ; CHECK-LABEL: define void @foo define void @foo() { ; CHECK: ret void, !dbg ![[RET1:.*]] diff --git a/llvm/test/DebugInfo/pr37964.ll b/llvm/test/DebugInfo/pr37964.ll index 9581f1a6b35dc..63db67d2bd37f 100644 --- a/llvm/test/DebugInfo/pr37964.ll +++ b/llvm/test/DebugInfo/pr37964.ll @@ -1,4 +1,5 @@ ; RUN: opt -disable-output -debugify-each -passes=gvn < %s 2>&1 | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false -disable-output -debugify-each -passes=gvn < %s 2>&1 | FileCheck %s ; CHECK-NOT: ERROR: Instruction with empty DebugLoc in function _Z3bazv -- {{%.*}} = phi ; CHECK: CheckFunctionDebugify [GVNPass]: PASS diff --git a/llvm/test/DebugInfo/salvage-cast-debug-info.ll b/llvm/test/DebugInfo/salvage-cast-debug-info.ll index 4676aee3d4e48..b72f717a4f2de 100644 --- a/llvm/test/DebugInfo/salvage-cast-debug-info.ll +++ b/llvm/test/DebugInfo/salvage-cast-debug-info.ll @@ -1,5 +1,5 @@ ; RUN: opt %s -passes=debugify,early-cse -earlycse-debug-hash -S | FileCheck %s -; RUN: opt %s -passes=debugify,early-cse -earlycse-debug-hash -S --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: opt --experimental-debuginfo-iterators=false %s -passes=debugify,early-cse -earlycse-debug-hash -S | FileCheck %s define i32 @foo(i64 %nose, i32 %more) { ; CHECK-LABEL: @foo( ; CHECK: call void @llvm.dbg.value(metadata i64 %nose, metadata [[V1:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_convert, 64, DW_ATE_unsigned, DW_OP_LLVM_convert, 32, DW_ATE_unsigned diff --git a/llvm/test/DebugInfo/verify-di-preserve.ll b/llvm/test/DebugInfo/verify-di-preserve.ll index a2f1b1dd78dc5..92fc62a0b34c4 100644 --- a/llvm/test/DebugInfo/verify-di-preserve.ll +++ b/llvm/test/DebugInfo/verify-di-preserve.ll @@ -1,10 +1,10 @@ ; RUN: opt %s -verify-debuginfo-preserve -passes=instcombine -disable-output 2>&1 | FileCheck --check-prefix=VERIFY %s -; RUN: opt --try-experimental-debuginfo-iterators %s -verify-debuginfo-preserve -passes=instcombine -disable-output 2>&1 | FileCheck --check-prefix=VERIFY %s +; RUN: opt --experimental-debuginfo-iterators=false %s -verify-debuginfo-preserve -passes=instcombine -disable-output 2>&1 | FileCheck --check-prefix=VERIFY %s ; VERIFY: CheckModuleDebugify (original debuginfo): ; RUN: opt %s -verify-each-debuginfo-preserve -O2 -disable-output 2>&1 | FileCheck --check-prefix=VERIFY-EACH %s -; RUN: opt %s --try-experimental-debuginfo-iterators -verify-each-debuginfo-preserve -O2 -disable-output 2>&1 | FileCheck --check-prefix=VERIFY-EACH %s +; RUN: opt %s --experimental-debuginfo-iterators=false -verify-each-debuginfo-preserve -O2 -disable-output 2>&1 | FileCheck --check-prefix=VERIFY-EACH %s ; VERIFY-EACH: DeadArgumentEliminationPass ; VERIFY-EACH: GlobalDCEPass diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s index 1c95bde51e121..936486b8a319c 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_vtune.s @@ -20,12 +20,14 @@ main: .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp + pushq %rbx 
.cfi_def_cfa_register 6 - movl %edi, -4(%rbp) - movq %rsi, -16(%rbp) - movl -4(%rbp), %ebx + movl %edi, -16(%rbp) + movq %rsi, -24(%rbp) + movl -16(%rbp), %ebx addl $1, %ebx - movl $0, %eax + movl $0, %eax + popq %rbx popq %rbp .cfi_def_cfa 7, 8 ret diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index 18e0ede5e2937..0e27d6301bb3c 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -28,7 +28,7 @@ # RELAX: Relocations [ # RELAX-NEXT: Section ({{.*}}) .rela.text { -# RELAX-NEXT: 0x4 R_LARCH_ALIGN {{.*}} 0x4 +# RELAX-NEXT: 0x4 R_LARCH_ALIGN .text 0x4 # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 # RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align.s b/llvm/test/MC/LoongArch/Relocations/relax-align.s index 294fd9fb916c7..0246d5b46431c 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-align.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-align.s @@ -63,17 +63,19 @@ ret ## Test the symbol index is different from .text. .section .text2, "ax" .p2align 4 +.p2align 4, , 4 break 7 # RELOC: Relocations [ # RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text { -# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .Lla-relax-align0 0x4 -# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .Lla-relax-align0 0x5 -# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .Lla-relax-align0 0x4 -# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .Lla-relax-align0 0xB04 -# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .Lla-relax-align0 0x4 +# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .text 0x4 +# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .text 0x5 +# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .text 0x4 +# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .text 0xB04 +# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .text 0x4 # RELAX-RELOC-NEXT: } # RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text2 { -# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align1 0x4 +# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .text2 0x4 +# RELAX-RELOC-NEXT: 0xC R_LARCH_ALIGN .text2 0x404 # RELAX-RELOC-NEXT: } # RELOC-NEXT: ] diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index a8f493f781ec3..8835ff22446c8 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -270,6 +270,9 @@ .attribute arch, "rv32iza64rs1p0" # CHECK: attribute 5, "rv32i2p1_za64rs1p0" +.attribute arch, "rv32izama16b" +# CHECK: attribute 5, "rv32i2p1_zama16b1p0" + .attribute arch, "rv32izawrs1p0" # CHECK: attribute 5, "rv32i2p1_zawrs1p0" diff --git a/llvm/test/MC/RISCV/rv32zcmop-invalid.s b/llvm/test/MC/RISCV/rv32zcmop-invalid.s index 71d72d59b0209..fb6252f7f0760 100644 --- a/llvm/test/MC/RISCV/rv32zcmop-invalid.s +++ b/llvm/test/MC/RISCV/rv32zcmop-invalid.s @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+zcmop < %s 2>&1 | FileCheck %s -cmop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic +c.mop.0 # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic -cmop.1 t0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +c.mop.1 t0 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction -cmop.1 0x0 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +c.mop.1 0x0 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rvzcmop-valid.s b/llvm/test/MC/RISCV/rvzcmop-valid.s index c6bb4a1580825..dd5d26ac5dd0c 100644 --- a/llvm/test/MC/RISCV/rvzcmop-valid.s +++ 
b/llvm/test/MC/RISCV/rvzcmop-valid.s @@ -9,34 +9,34 @@ # RUN: | llvm-objdump --mattr=+zcmop -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# CHECK-ASM-AND-OBJ: cmop.1 +# CHECK-ASM-AND-OBJ: c.mop.1 # CHECK-ASM: encoding: [0x81,0x60] -cmop.1 +c.mop.1 -# CHECK-ASM-AND-OBJ: cmop.3 +# CHECK-ASM-AND-OBJ: c.mop.3 # CHECK-ASM: encoding: [0x81,0x61] -cmop.3 +c.mop.3 -# CHECK-ASM-AND-OBJ: cmop.5 +# CHECK-ASM-AND-OBJ: c.mop.5 # CHECK-ASM: encoding: [0x81,0x62] -cmop.5 +c.mop.5 -# CHECK-ASM-AND-OBJ: cmop.7 +# CHECK-ASM-AND-OBJ: c.mop.7 # CHECK-ASM: encoding: [0x81,0x63] -cmop.7 +c.mop.7 -# CHECK-ASM-AND-OBJ: cmop.9 +# CHECK-ASM-AND-OBJ: c.mop.9 # CHECK-ASM: encoding: [0x81,0x64] -cmop.9 +c.mop.9 -# CHECK-ASM-AND-OBJ: cmop.11 +# CHECK-ASM-AND-OBJ: c.mop.11 # CHECK-ASM: encoding: [0x81,0x65] -cmop.11 +c.mop.11 -# CHECK-ASM-AND-OBJ: cmop.13 +# CHECK-ASM-AND-OBJ: c.mop.13 # CHECK-ASM: encoding: [0x81,0x66] -cmop.13 +c.mop.13 -# CHECK-ASM-AND-OBJ: cmop.15 +# CHECK-ASM-AND-OBJ: c.mop.15 # CHECK-ASM: encoding: [0x81,0x67] -cmop.15 +c.mop.15 diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll index 6b31b31a78c98..6fd2d40cd2f29 100644 --- a/llvm/test/Other/lint.ll +++ b/llvm/test/Other/lint.ll @@ -124,13 +124,6 @@ define void @0() nounwind { ret void } -; CHECK: va_start called in a non-varargs function -declare void @llvm.va_start(ptr) -define void @not_vararg(ptr %p) nounwind { - call void @llvm.va_start(ptr %p) - ret void -} - ; CHECK: Undefined behavior: Branch to non-blockaddress define void @use_indbr() { indirectbr ptr @foo, [label %block] diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td index 40a831d7e9e8f..ebb95ccb21040 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/builtins/match-table-replacerreg.td @@ -28,11 +28,11 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(69), GIMT_Encode2(186), /*)*//*default:*//*Label 2*/ GIMT_Encode4(562), -// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4(478), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4(530), -// CHECK-NEXT: // Label 0: @478 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(529), // Rule ID 1 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2({{[0-9]+}}), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 2*/ GIMT_Encode4([[L562:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::G_UNMERGE_VALUES*//*Label 0*/ GIMT_Encode4([[L478:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_FNEG*//*Label 1*/ GIMT_Encode4([[L530:[0-9]+]]), +// CHECK-NEXT: // Label 0: @[[L478]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4([[L529:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // 
CHECK-NEXT: // MIs[0] a @@ -57,10 +57,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_ReplaceRegWithTempReg, /*OldInsnID*/0, /*OldOpIdx*/1, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 3: @529 +// CHECK-NEXT: // Label 3: @[[L529]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @530 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(561), // Rule ID 0 // +// CHECK-NEXT: // Label 1: @[[L530]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L561:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -75,10 +75,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_ReplaceReg, /*OldInsnID*/0, /*OldOpIdx*/0, /*NewInsnId*/1, /*NewOpIdx*/1, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @561 +// CHECK-NEXT: // Label 4: @[[L561]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @562 +// CHECK-NEXT: // Label 2: @[[L562]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 563 bytes +// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td index 751b1318ecc01..6004a17d351be 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-imms.td @@ -34,12 +34,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 3*/ GIMT_Encode4(579), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4(493), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4(539), -// CHECK-NEXT: // Label 0: @462 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(492), // Rule ID 0 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L579:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4([[L462:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_CONSTANT*//*Label 1*/ GIMT_Encode4([[L493:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 2*/ GIMT_Encode4({{[0-9]+}}), +// CHECK-NEXT: // Label 0: @[[L462]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L492:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -51,10 +51,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddImm8, /*InsnID*/0, /*Imm*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @492 +// CHECK-NEXT: // Label 4: @[[L492]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @493 -// 
CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(538), // Rule ID 2 // +// CHECK-NEXT: // Label 1: @[[L493]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4([[L538:[0-9]+]]), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/1, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] a @@ -66,10 +66,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddCImm, /*InsnID*/0, /*Type*/GILLT_s32, /*Imm*/GIMT_Encode8(42), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @538 +// CHECK-NEXT: // Label 5: @[[L538]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @539 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(578), // Rule ID 1 // +// CHECK-NEXT: // Label 2: @{{[0-9]+}} +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4([[L578:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -83,10 +83,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @578 +// CHECK-NEXT: // Label 6: @[[L578]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @579 +// CHECK-NEXT: // Label 3: @[[L579]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 580 bytes +// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td index e8e6d3e74f402..b2dd8b6684b1d 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-intrinsics.td @@ -29,11 +29,11 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(119), GIMT_Encode2(121), /*)*//*default:*//*Label 2*/ GIMT_Encode4(132), +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2({{[0-9]+}}), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 2*/ GIMT_Encode4([[L132:[0-9]+]]), // CHECK-NEXT: /*TargetOpcode::G_INTRINSIC*//*Label 0*/ GIMT_Encode4(18), -// CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4(73), +// CHECK-NEXT: /*TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS*//*Label 1*/ GIMT_Encode4([[L73:[0-9]+]]), // CHECK-NEXT: // Label 0: @18 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4(72), // Rule ID 0 // +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 3*/ GIMT_Encode4([[L72:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckIntrinsicID, /*MI*/0, /*Op*/1, GIMT_Encode2(Intrinsic::1in_1out), @@ -52,10 +52,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/1, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 3: @72 +// CHECK-NEXT: // Label 3: @[[L72]] // CHECK-NEXT: GIM_Reject, -// 
CHECK-NEXT: // Label 1: @73 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(131), // Rule ID 1 // +// CHECK-NEXT: // Label 1: @[[L73]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L131:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: GIM_CheckNumOperands, /*MI*/0, /*Expected*/3, // CHECK-NEXT: GIM_CheckIntrinsicID, /*MI*/0, /*Op*/1, GIMT_Encode2(Intrinsic::sideeffects_1in_1out), @@ -76,9 +76,9 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_MergeMemOperands, /*InsnID*/1, /*NumInsns*/1, /*MergeInsnID's*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @131 +// CHECK-NEXT: // Label 4: @[[L131]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @132 +// CHECK-NEXT: // Label 2: @[[L132]] // CHECK-NEXT: GIM_Reject, // CHECK-NEXT: }; // Size: 133 bytes // CHECK-NEXT: return MatchTable0; diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td index 26a0ec6235e30..016ab05ca01e4 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table-patfrag-root.td @@ -28,12 +28,12 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(124), GIMT_Encode2(187), /*)*//*default:*//*Label 3*/ GIMT_Encode4(380), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4(262), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4(298), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4(344), -// CHECK-NEXT: // Label 0: @262 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4(297), // Rule ID 1 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2({{[0-9]+}}), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 3*/ GIMT_Encode4([[L380:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 0*/ GIMT_Encode4([[L262:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 1*/ GIMT_Encode4([[L298:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_FPEXT*//*Label 2*/ GIMT_Encode4([[L344:[0-9]+]]), +// CHECK-NEXT: // Label 0: @[[L262]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 4*/ GIMT_Encode4([[L297:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates @@ -47,10 +47,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 4: @297 +// CHECK-NEXT: // Label 4: @[[L297]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @298 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4(343), // Rule ID 0 // +// CHECK-NEXT: // Label 1: @[[L298]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 5*/ GIMT_Encode4([[L343:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates @@ -68,10 +68,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 5: @343 +// CHECK-NEXT: // Label 5: @[[L343]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @344 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4(379), // Rule ID 2 // +// CHECK-NEXT: // Label 2: @[[L344]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 6*/ GIMT_Encode4([[L379:[0-9]+]]), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // MIs[0] root // CHECK-NEXT: // No operand predicates @@ -85,10 +85,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 6: @379 +// CHECK-NEXT: // Label 6: @[[L379]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @380 +// CHECK-NEXT: // Label 3: @[[L380]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: }; // Size: 381 bytes +// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes // CHECK-NEXT: return MatchTable0; // CHECK-NEXT: } diff --git a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td index 0189d3d056fc0..02085c1fd2666 100644 --- a/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td +++ b/llvm/test/TableGen/GlobalISelCombinerEmitter/match-table.td @@ -135,15 +135,15 @@ def MyCombiner: 
GICombiner<"GenMyCombiner", [ // Verify match table. // CHECK: const uint8_t *GenMyCombiner::getMatchTable() const { // CHECK-NEXT: constexpr static uint8_t MatchTable0[] = { -// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2(132), /*)*//*default:*//*Label 6*/ GIMT_Encode4(677), -// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4(462), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4(504), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4(557), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4(599), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4(624), GIMT_Encode4(0), -// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4(637), -// CHECK-NEXT: // Label 0: @462 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4(491), // Rule ID 4 // +// CHECK-NEXT: GIM_SwitchOpcode, /*MI*/0, /*[*/GIMT_Encode2(19), GIMT_Encode2({{[0-9]+}}), /*)*//*default:*//*Label 6*/ GIMT_Encode4([[L677:[0-9]+]]), +// CHECK-NEXT: /*TargetOpcode::COPY*//*Label 0*/ GIMT_Encode4([[L462:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), 
GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_AND*//*Label 1*/ GIMT_Encode4([[L504:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_STORE*//*Label 2*/ GIMT_Encode4([[L557:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_TRUNC*//*Label 3*/ GIMT_Encode4([[L599:[0-9]+]]), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_SEXT*//*Label 4*/ GIMT_Encode4([[L624:[0-9]+]]), GIMT_Encode4(0), +// CHECK-NEXT: /*TargetOpcode::G_ZEXT*//*Label 5*/ GIMT_Encode4([[L637:[0-9]+]]), +// CHECK-NEXT: // Label 0: @[[L462]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 7*/ GIMT_Encode4([[L491:[0-9]+]]), // Rule ID 4 // // CHECK-NEXT: GIM_CheckFeatures, GIMT_Encode2(GIFBS_HasAnswerToEverything), // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule3Enabled), // CHECK-NEXT: // MIs[0] a @@ -158,8 +158,8 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #3: InstTest1 // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 7: @491 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4(503), // Rule ID 3 // +// CHECK-NEXT: // Label 7: @[[L491]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 8*/ GIMT_Encode4([[L503:[0-9]+]]), // Rule ID 3 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule2Enabled), // CHECK-NEXT: // MIs[0] a // CHECK-NEXT: // No operand predicates @@ -168,10 +168,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: // Combiner Rule #2: InstTest0 // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner1), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 8: @503 +// CHECK-NEXT: // Label 8: @[[L503]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 1: @504 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4(556), // Rule ID 6 // +// CHECK-NEXT: // Label 1: @[[L504]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 9*/ GIMT_Encode4([[L556:[0-9]+]]), // Rule ID 6 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule5Enabled), // CHECK-NEXT: GIM_CheckType, /*MI*/0, /*Op*/2, /*Type*/GILLT_s32, // CHECK-NEXT: // MIs[0] dst @@ -189,10 +189,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_Copy, /*NewInsnID*/0, /*OldInsnID*/1, /*OpIdx*/1, // z // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// 
CHECK-NEXT: // Label 9: @556 +// CHECK-NEXT: // Label 9: @[[L556]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 2: @557 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4(598), // Rule ID 5 // +// CHECK-NEXT: // Label 2: @[[L557]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 10*/ GIMT_Encode4([[L598:[0-9]+]]), // Rule ID 5 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule4Enabled), // CHECK-NEXT: // MIs[0] tmp // CHECK-NEXT: GIM_RecordInsnIgnoreCopies, /*DefineMI*/1, /*MI*/0, /*OpIdx*/0, // MIs[1] @@ -210,32 +210,32 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner2), // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 10: @598 +// CHECK-NEXT: // Label 10: @[[L598]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 3: @599 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4(611), // Rule ID 0 // +// CHECK-NEXT: // Label 3: @[[L599]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 11*/ GIMT_Encode4([[L611:[0-9]+]]), // Rule ID 0 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule0Enabled), // CHECK-NEXT: // Combiner Rule #0: WipOpcodeTest0; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 11: @611 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4(623), // Rule ID 1 // +// CHECK-NEXT: // Label 11: @[[L611]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 12*/ GIMT_Encode4([[L623:[0-9]+]]), // Rule ID 1 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_TRUNC' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 12: @623 +// CHECK-NEXT: // Label 12: @[[L623]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 4: @624 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4(636), // Rule ID 2 // +// CHECK-NEXT: // Label 4: @[[L624]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 13*/ GIMT_Encode4([[L636:[0-9]+]]), // Rule ID 2 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule1Enabled), // CHECK-NEXT: // Combiner Rule #1: WipOpcodeTest1; wip_match_opcode 'G_SEXT' // CHECK-NEXT: GIR_CustomAction, GIMT_Encode2(GICXXCustomAction_CombineApplyGICombiner0), // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 13: @636 +// CHECK-NEXT: // Label 13: @[[L636]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 5: @637 -// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4(676), // Rule ID 7 // +// CHECK-NEXT: // Label 5: @[[L637]] +// CHECK-NEXT: GIM_Try, /*On fail goto*//*Label 14*/ GIMT_Encode4([[L676:[0-9]+]]), // Rule ID 7 // // CHECK-NEXT: GIM_CheckSimplePredicate, GIMT_Encode2(GICXXPred_Simple_IsRule6Enabled), // CHECK-NEXT: // MIs[0] dst // CHECK-NEXT: // No operand predicates @@ -250,10 +250,10 @@ def MyCombiner: GICombiner<"GenMyCombiner", [ // CHECK-NEXT: GIR_AddSimpleTempRegister, /*InsnID*/0, /*TempRegID*/0, // CHECK-NEXT: GIR_EraseFromParent, /*InsnID*/0, // CHECK-NEXT: GIR_Done, -// CHECK-NEXT: // Label 14: @676 +// CHECK-NEXT: // Label 14: @[[L676]] // CHECK-NEXT: GIM_Reject, -// CHECK-NEXT: // Label 6: @677 +// CHECK-NEXT: // Label 6: @[[L677]] // CHECK-NEXT: GIM_Reject, 
-// CHECK-NEXT: }; // Size: 678 bytes
+// CHECK-NEXT: }; // Size: {{[0-9]+}} bytes
 // CHECK-NEXT: return MatchTable0;
 // CHECK-NEXT: }
diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td
index f79b792b37a36..82ecc4495e80a 100644
--- a/llvm/test/TableGen/GlobalISelEmitter.td
+++ b/llvm/test/TableGen/GlobalISelEmitter.td
@@ -518,7 +518,7 @@ def : Pat<(frag GPR32:$src1, complex:$src2, complex:$src3),
 // R00O-NEXT: GIM_Reject,
 // R00O: // Label [[DEFAULT_NUM]]: @[[DEFAULT]]
 // R00O-NEXT: GIM_Reject,
-// R00O-NEXT: }; // Size: 2023 bytes
+// R00O-NEXT: }; // Size: 2027 bytes
 
 def INSNBOB : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3, GPR32:$src4),
                 [(set GPR32:$dst,
diff --git a/llvm/test/TableGen/def-multiple-operands.td b/llvm/test/TableGen/def-multiple-operands.td
new file mode 100644
index 0000000000000..b747c58907505
--- /dev/null
+++ b/llvm/test/TableGen/def-multiple-operands.td
@@ -0,0 +1,37 @@
+// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+def archInstrInfo : InstrInfo {}
+
+def arch : Target {
+  let InstructionSet = archInstrInfo;
+}
+
+def R0 : Register<"r0">;
+def P0 : Register<"p0">;
+def R32 : RegisterClass<"MyNS", [i32], 0, (add R0)>;
+def P1 : RegisterClass<"MyNS", [i1], 0, (add P0)>;
+
+def Reg3Opnd : Operand {
+  let MIOperandInfo = (ops R32, R32, P1);
+}
+
+// The following checks verify that 'MCInstrDesc' entry for 'InstA' has the
+// expected 'NumOperands' and 'NumDefs', i.e. 'InstA' should have 3 defs out of
+// 4 operands.
+
+// CHECK: archInstrTable {{.* = \{}}
+// CHECK: {{\{}}
+// CHECK: {{\{}} [[ID:[0-9]+]], 4, 3, 13, {{.+\}, \/\/}}
+// CHECK-SAME: Inst #[[ID]] = InstA
+def InstA : Instruction {
+  let Namespace = "MyNS";
+  let Size = 13;
+  // InstA should have 3 defs out of 4 operands.
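+  // $dst is a single Reg3Opnd operand whose MIOperandInfo expands into three
+  // machine operands (R32, R32, P1), all of them defs; together with the one
+  // $c immediate input this yields the "4, 3" (NumOperands, NumDefs) pair
+  // checked in the MCInstrDesc entry above.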
+ let OutOperandList = (outs Reg3Opnd:$dst); + let InOperandList = (ins i32imm:$c); + field bits<8> Inst; + field bits<8> SoftFail = 0; + let hasSideEffects = false; +} diff --git a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll index 6f659a88da2e2..c5f656c870a23 100644 --- a/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll +++ b/llvm/test/Transforms/IndVarSimplify/AArch64/widen-loop-comp.ll @@ -41,7 +41,7 @@ define i32 @test1() { ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_COND]] ; CHECK: if.then: ; CHECK-NEXT: [[I_05_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[I_05_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw nsw i64 [[I_05_LCSSA_WIDE]] to i32 ; CHECK-NEXT: store i32 [[TMP5]], ptr @idx, align 4 ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: for.cond.for.end.loopexit_crit_edge: @@ -237,7 +237,7 @@ define i32 @test4(i32 %a) { ; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[OR]] to i8 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @fn1(i8 signext [[CONV3]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i32 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDVARS_IV_NEXT]] to i8 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i32 [[INDVARS_IV_NEXT]] to i8 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[TMP0]], -14 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: @@ -466,7 +466,7 @@ define i32 @test9(ptr %a, i32 %b, i32 %init) { ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM_0]], [[TMP1]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 0, [[TMP2]] ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[FOR_END]] ; CHECK: for.end: @@ -997,7 +997,7 @@ define i32 @test16_unsigned_neg(i32 %start, ptr %p, ptr %q, i32 %x) { ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[INDVARS_IV]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[FOO:%.*]] = add i32 [[TMP1]], -1 ; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[GUARDED:%.*]] ; CHECK: guarded: diff --git a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll index d05755bea0ddd..4e0c503794bfe 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/iv-widen.ll @@ -23,7 +23,7 @@ define void @loop_0(ptr %a) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[B18_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[B24:%.*]] ] ; CHECK-NEXT: call void @use(i64 [[INDVARS_IV]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[O:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[V:%.*]] = load i32, ptr [[O]], align 4 ; CHECK-NEXT: [[T:%.*]] = icmp eq i32 [[V]], 0 @@ -37,7 +37,7 @@ define void @loop_0(ptr %a) { ; CHECK-NEXT: ret void ; CHECK: exit24: ; CHECK-NEXT: 
[[DOT02_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV]], [[B18]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[DOT02_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[DOT02_LCSSA_WIDE]] to i32 ; CHECK-NEXT: call void @dummy(i32 [[TMP1]]) ; CHECK-NEXT: unreachable ; @@ -159,7 +159,7 @@ declare void @dummy(i32) declare void @dummy.i64(i64) -define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %tmp1) { +define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %arg) { ; CHECK-LABEL: @loop_2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP215:%.*]] = icmp sgt i32 [[SIZE:%.*]], 1 @@ -180,12 +180,12 @@ define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %tmp1) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1, [[FOR_BODY2_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY2]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], [[INDVARS_IV]] ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[LINED:%.*]], i64 [[TMP4]] -; CHECK-NEXT: store i8 [[TMP1:%.*]], ptr [[ADD_PTR]], align 1 +; CHECK-NEXT: store i8 [[ARG:%.*]], ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY2]], label [[FOR_BODY3_PREHEADER:%.*]] ; CHECK: for.body3.preheader: -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nsw i64 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 ; CHECK-NEXT: [[WIDE_TRIP_COUNT7:%.*]] = zext i32 [[SIZE]] to i64 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] @@ -193,7 +193,7 @@ define void @loop_2(i32 %size, i32 %nsteps, i32 %hsize, ptr %lined, i8 %tmp1) { ; CHECK-NEXT: [[INDVARS_IV3:%.*]] = phi i64 [ 1, [[FOR_BODY3_PREHEADER]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], [[INDVARS_IV3]] ; CHECK-NEXT: [[ADD_PTR2:%.*]] = getelementptr inbounds i8, ptr [[LINED]], i64 [[TMP7]] -; CHECK-NEXT: store i8 [[TMP1]], ptr [[ADD_PTR2]], align 1 +; CHECK-NEXT: store i8 [[ARG]], ptr [[ADD_PTR2]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1 ; CHECK-NEXT: [[EXITCOND8:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT4]], [[WIDE_TRIP_COUNT7]] ; CHECK-NEXT: br i1 [[EXITCOND8]], label [[FOR_BODY3]], label [[FOR_INC_LOOPEXIT:%.*]] @@ -222,7 +222,7 @@ for.body2: %add4 = add nsw i32 %add, %k %idx.ext = sext i32 %add4 to i64 %add.ptr = getelementptr inbounds i8, ptr %lined, i64 %idx.ext - store i8 %tmp1, ptr %add.ptr, align 1 + store i8 %arg, ptr %add.ptr, align 1 %inc = add nsw i32 %k, 1 %cmp2 = icmp slt i32 %inc, %size br i1 %cmp2, label %for.body2, label %for.body3 @@ -233,7 +233,7 @@ for.body3: %add5 = add nuw i32 %add, %l %idx.ext2 = zext i32 %add5 to i64 %add.ptr2 = getelementptr inbounds i8, ptr %lined, i64 %idx.ext2 - store i8 %tmp1, ptr %add.ptr2, align 1 + store i8 %arg, ptr %add.ptr2, align 1 %inc2 = add nsw i32 %l, 1 %cmp3 = icmp slt i32 %inc2, %size br i1 %cmp3, label %for.body3, label %for.inc diff --git a/llvm/test/Transforms/IndVarSimplify/elim-extend.ll b/llvm/test/Transforms/IndVarSimplify/elim-extend.ll index 54bb9951ff66a..01c95dadd1626 100644 --- a/llvm/test/Transforms/IndVarSimplify/elim-extend.ll +++ b/llvm/test/Transforms/IndVarSimplify/elim-extend.ll @@ -142,7 +142,7 @@ define void @nestedIV(ptr %address, i32 %limit) nounwind { ; CHECK-NEXT: br i1 [[EXITCOND]], label [[INNERLOOP]], label [[INNEREXIT:%.*]] ; CHECK: 
innerexit: ; CHECK-NEXT: [[INNERCOUNT_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV_NEXT]], [[INNERLOOP]] ] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INNERCOUNT_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc nsw i64 [[INNERCOUNT_LCSSA_WIDE]] to i32 ; CHECK-NEXT: br label [[OUTERMERGE]] ; CHECK: outermerge: ; CHECK-NEXT: [[INNERCOUNT_MERGE]] = phi i32 [ [[TMP3]], [[INNEREXIT]] ], [ [[INNERCOUNT]], [[INNERPREHEADER]] ] diff --git a/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll b/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll index cc99ee312ccb7..1135ca9dbf00d 100644 --- a/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll +++ b/llvm/test/Transforms/IndVarSimplify/hoist-wide-inc-for-narrow-use-recompute-flags.ll @@ -15,7 +15,7 @@ define void @test_pr82243(ptr %f) { ; CHECK-NEXT: [[GEP_IV_EXT:%.*]] = getelementptr i32, ptr [[F]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store i32 1, ptr [[GEP_IV_EXT]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[SHL:%.*]] = shl i32 123, [[TMP0]] ; CHECK-NEXT: [[GEP_SHL:%.*]] = getelementptr i32, ptr [[F]], i32 [[SHL]] ; CHECK-NEXT: br label [[INNER_HEADER:%.*]] diff --git a/llvm/test/Transforms/IndVarSimplify/iv-sext.ll b/llvm/test/Transforms/IndVarSimplify/iv-sext.ll index 450913f16baa2..95a036f0e54c7 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-sext.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-sext.ll @@ -99,7 +99,7 @@ define void @t(ptr %pval1, ptr %peakWeight, ptr %nrgReducePeakrate, i32 %bandEdg ; CHECK-NEXT: [[VAL35_LCSSA:%.*]] = phi float [ [[VAL35]], [[BB5]] ] ; CHECK-NEXT: [[VAL31_LCSSA_WIDE:%.*]] = phi i64 [ [[INDVARS_IV_NEXT]], [[BB5]] ] ; CHECK-NEXT: [[VAL30_LCSSA:%.*]] = phi float [ [[VAL30]], [[BB5]] ] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[VAL31_LCSSA_WIDE]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nsw i64 [[VAL31_LCSSA_WIDE]] to i32 ; CHECK-NEXT: br label [[BB7]] ; CHECK: bb7: ; CHECK-NEXT: [[DISTERBHI_2_LCSSA]] = phi float [ [[VAL30_LCSSA]], [[BB5_BB7_CRIT_EDGE]] ], [ [[DISTERBHI_0_PH]], [[BB5_PREHEADER]] ] diff --git a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll index 59a0241bfe9fd..a83e9ce74b12a 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll @@ -22,7 +22,7 @@ define void @foo(ptr %A, ptr %B, ptr %C, i32 %N) { ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP0]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[DIV0:%.*]] = udiv i32 5, [[TMP3]] ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[ADD3]], [[DIV0]] ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] @@ -224,7 +224,7 @@ define i32 @foo3(i32 %M) { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nsw i64 [[TMP3]] 
to i32 ; CHECK-NEXT: [[IDXPROM4:%.*]] = zext i32 [[TMP4]] to i64 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [100 x i32], ptr @a, i64 0, i64 [[IDXPROM4]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 @@ -365,7 +365,7 @@ define i32 @foo5(ptr %input, i32 %length, ptr %in) { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 1, [[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INPUT]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = trunc nuw nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP4]], [[TMP5]] ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i32, ptr [[IN:%.*]], i64 [[IDX_EXT]] @@ -514,7 +514,7 @@ define void @foo7(i32 %n, ptr %a, i32 %x) { ; CHECK-NEXT: [[TMP2:%.*]] = shl nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: store i32 [[TMP4]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP1]] diff --git a/llvm/test/Transforms/IndVarSimplify/lftr.ll b/llvm/test/Transforms/IndVarSimplify/lftr.ll index 41db925de577e..7f4820f093e55 100644 --- a/llvm/test/Transforms/IndVarSimplify/lftr.ll +++ b/llvm/test/Transforms/IndVarSimplify/lftr.ll @@ -525,7 +525,7 @@ define float @wide_trip_count_test3(ptr %b, ; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[INDVARS_IV]], 20 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TEMP:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[TEMP]] ; CHECK-NEXT: [[ADD1]] = fadd float [[SUM_07]], [[MUL]] @@ -584,7 +584,7 @@ define float @wide_trip_count_test4(ptr %b, ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 20 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TEMP:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP1]] to float ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[CONV]], [[TEMP]] ; CHECK-NEXT: [[ADD1]] = fadd float [[SUM_07]], [[MUL]] diff --git a/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll index c35c5bacf68ca..579b8536cedf0 100644 --- a/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll +++ b/llvm/test/Transforms/IndVarSimplify/no-iv-rewrite.ll @@ -213,7 +213,7 @@ define void @maxvisitor(i32 %limit, ptr %base) nounwind { ; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i32 [[VAL]], [[MAX]] ; CHECK-NEXT: br i1 [[CMP19]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to 
i32 ; CHECK-NEXT: br label [[LOOP_INC]] ; CHECK: if.else: ; CHECK-NEXT: br label [[LOOP_INC]] diff --git a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll index 5c22ba1044b60..bbdee0267effb 100644 --- a/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll +++ b/llvm/test/Transforms/IndVarSimplify/post-inc-range.ll @@ -180,7 +180,7 @@ define void @test_neg(ptr %array_length_ptr, ptr %base, ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP2]], [[LIMIT:%.*]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/IndVarSimplify/pr25578.ll b/llvm/test/Transforms/IndVarSimplify/pr25578.ll index d8adc178474c0..380e8171798b0 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr25578.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr25578.ll @@ -13,7 +13,7 @@ L1_header: ; CHECK: L2_header: ; CHECK: %[[INDVAR:.*]] = phi i64 -; CHECK: %[[TRUNC:.*]] = trunc i64 %[[INDVAR]] to i32 +; CHECK: %[[TRUNC:.*]] = trunc nuw nsw i64 %[[INDVAR]] to i32 L2_header: %i = phi i32 [ 0, %L1_header ], [ %i_next, %L2_latch ] %i_prom = sext i32 %i to i64 diff --git a/llvm/test/Transforms/IndVarSimplify/pr55925.ll b/llvm/test/Transforms/IndVarSimplify/pr55925.ll index 312a8295ccdc9..2ad187add4e10 100644 --- a/llvm/test/Transforms/IndVarSimplify/pr55925.ll +++ b/llvm/test/Transforms/IndVarSimplify/pr55925.ll @@ -14,11 +14,11 @@ define void @test(ptr %p) personality ptr undef { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foo(i32 returned [[TMP0]]) ; CHECK-NEXT: to label [[LOOP_LATCH]] unwind label [[EXIT:%.*]] ; CHECK: loop.latch: -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @foo(i32 [[TMP1]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: br label [[LOOP]] @@ -56,8 +56,8 @@ define void @test_critedge(i1 %c, ptr %p) personality ptr undef { ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br i1 [[C:%.*]], label [[LOOP_INVOKE:%.*]], label [[LOOP_OTHER:%.*]] ; CHECK: loop.invoke: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[RES:%.*]] = invoke i32 @foo(i32 returned [[TMP0]]) ; CHECK-NEXT: to label [[LOOP_LATCH]] unwind label [[EXIT:%.*]] ; CHECK: loop.other: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll index d473103f5824e..9c8983421029f 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative-countdown.ll @@ -223,7 +223,7 @@ define void 
@sext_postinc(ptr %A, i32 %start) { ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP1]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -262,7 +262,7 @@ define void @sext_preinc(ptr %A, i32 %start) { ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP1]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -366,7 +366,7 @@ define void @zext_postinc_offset_constant_one(ptr %A, i32 %start) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[J_016_US:%.*]] = phi i32 [ [[INC_US:%.*]], [[FOR_BODY]] ], [ [[START]], [[FOR_BODY_PREHEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -513,13 +513,13 @@ define void @sext_postinc_offset_constant_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], 1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -556,13 +556,13 @@ define void @sext_preinc_offset_constant_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add nuw i32 [[TMP1]], 1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -808,13 +808,13 @@ define void @sext_postinc_offset_constant_minus_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], -1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV_NEXT]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: @@ -851,13 +851,13 @@ define void @sext_preinc_offset_constant_minus_one(ptr %A, i32 %start) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP1]], -1 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ugt i32 [[TMP2]], 6 ; CHECK-NEXT: br i1 [[CMP2_US]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: diff --git a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll index 739db26311f4a..e00eaafa3f192 100644 --- a/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll +++ b/llvm/test/Transforms/IndVarSimplify/widen-nonnegative.ll @@ -150,7 +150,7 @@ define void @sext_add_nuw(ptr %A, i32 %offset, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -185,7 +185,7 @@ define void @sext_add_noflags(ptr %A, i32 %offset, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, 
[[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP0]], [[OFFSET:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -223,7 +223,7 @@ define void @zext_add_nsw(ptr %A, i32 %offset, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -293,7 +293,7 @@ define void @zext_add_noflags(ptr %A, i32 %offset, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP0]], [[OFFSET:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -399,7 +399,7 @@ define void @zext_nneg_add_noflags(ptr %A, i32 %offset, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ADD_US:%.*]] = add i32 [[TMP0]], [[OFFSET:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[ADD_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -475,7 +475,7 @@ define void @sext_mul_nuw(ptr %A, i32 %multiple, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nuw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -510,7 +510,7 @@ define void @sext_mul_noflags(ptr %A, i32 %multiple, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[TMP0]], [[MULTIPLE:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = sext i32 [[MUL_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -548,7 +548,7 @@ define void @zext_mul_nsw(ptr %A, i32 %multiple, i32 %M) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 
[[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i64 [[INDVARS_IV]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = trunc nsw i64 [[TMP1]] to i32 ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[TMP2]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] ; CHECK-NEXT: tail call void @use_ptr(ptr [[ARRAYIDX_US]]) @@ -618,7 +618,7 @@ define void @zext_mul_noflags(ptr %A, i32 %multiple, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[TMP0]], [[MULTIPLE:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext i32 [[MUL_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] @@ -724,7 +724,7 @@ define void @zext_nneg_mul_noflags(ptr %A, i32 %multiple, i32 %M) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw nsw i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[MUL_US:%.*]] = mul i32 [[TMP0]], [[MULTIPLE:%.*]] ; CHECK-NEXT: [[IDXPROM_US:%.*]] = zext nneg i32 [[MUL_US]] to i64 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[IDXPROM_US]] diff --git a/llvm/test/Transforms/Inline/inline-switch-default-2.ll b/llvm/test/Transforms/Inline/inline-switch-default-2.ll index 8d3e24c798df8..82dae1c27648f 100644 --- a/llvm/test/Transforms/Inline/inline-switch-default-2.ll +++ b/llvm/test/Transforms/Inline/inline-switch-default-2.ll @@ -4,50 +4,6 @@ ; Check for scenarios without TTI. 
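; @bar1's switch has a reachable default branch, while @bar2's default is
; unreachable; the CHECK lines below expect the call to @bar1 to stay
; outlined in @foo1, whereas @bar2 is fully inlined into @foo2.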
define i64 @foo1(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @foo1( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT:%.*]] -; LOOKUPTABLE: branch_2.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: branch_4.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: branch_6.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: default_branch.i: -; LOOKUPTABLE-NEXT: br label [[BAR1_EXIT]] -; LOOKUPTABLE: bar1.exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @foo1( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH_I:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0.i: -; SWITCH-NEXT: br label [[BAR1_EXIT:%.*]] -; SWITCH: branch_2.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: branch_4.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: branch_6.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: default_branch.i: -; SWITCH-NEXT: br label [[BAR1_EXIT]] -; SWITCH: bar1.exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ], [ 3, [[DEFAULT_BRANCH_I]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @foo1( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: [[B:%.*]] = call i64 @bar1(i64 [[A]]) @@ -58,50 +14,6 @@ define i64 @foo1(i64 %a) { } define i64 @foo2(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @foo2( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT:%.*]] -; LOOKUPTABLE: branch_2.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] -; LOOKUPTABLE: branch_4.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] -; LOOKUPTABLE: branch_6.i: -; LOOKUPTABLE-NEXT: br label [[BAR2_EXIT]] -; LOOKUPTABLE: unreachabledefault.i: -; LOOKUPTABLE-NEXT: unreachable -; LOOKUPTABLE: bar2.exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @foo2( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT_I:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0_I:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2_I:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4_I:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6_I:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0.i: -; SWITCH-NEXT: br label [[BAR2_EXIT:%.*]] -; SWITCH: branch_2.i: -; SWITCH-NEXT: br label [[BAR2_EXIT]] -; SWITCH: 
branch_4.i: -; SWITCH-NEXT: br label [[BAR2_EXIT]] -; SWITCH: branch_6.i: -; SWITCH-NEXT: br label [[BAR2_EXIT]] -; SWITCH: unreachabledefault.i: -; SWITCH-NEXT: unreachable -; SWITCH: bar2.exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0_I]] ], [ 9, [[BRANCH_2_I]] ], [ 2, [[BRANCH_4_I]] ], [ 7, [[BRANCH_6_I]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @foo2( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT_I:%.*]] [ @@ -129,50 +41,6 @@ define i64 @foo2(i64 %a) { } define i64 @bar1(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @bar1( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0: -; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] -; LOOKUPTABLE: branch_2: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_4: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_6: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: default_branch: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @bar1( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label [[DEFAULT_BRANCH:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0: -; SWITCH-NEXT: br label [[EXIT:%.*]] -; SWITCH: branch_2: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_4: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_6: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: default_branch: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ], [ 3, [[DEFAULT_BRANCH]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @bar1( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: switch i64 [[A]], label [[DEFAULT_BRANCH:%.*]] [ @@ -223,50 +91,6 @@ exit: } define i64 @bar2(i64 %a) { -; LOOKUPTABLE-LABEL: define i64 @bar2( -; LOOKUPTABLE-SAME: i64 [[TMP0:%.*]]) { -; LOOKUPTABLE-NEXT: switch i64 [[TMP0]], label [[UNREACHABLEDEFAULT:%.*]] [ -; LOOKUPTABLE-NEXT: i64 0, label [[BRANCH_0:%.*]] -; LOOKUPTABLE-NEXT: i64 2, label [[BRANCH_2:%.*]] -; LOOKUPTABLE-NEXT: i64 4, label [[BRANCH_4:%.*]] -; LOOKUPTABLE-NEXT: i64 6, label [[BRANCH_6:%.*]] -; LOOKUPTABLE-NEXT: ] -; LOOKUPTABLE: branch_0: -; LOOKUPTABLE-NEXT: br label [[EXIT:%.*]] -; LOOKUPTABLE: branch_2: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_4: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: branch_6: -; LOOKUPTABLE-NEXT: br label [[EXIT]] -; LOOKUPTABLE: unreachabledefault: -; LOOKUPTABLE-NEXT: unreachable -; LOOKUPTABLE: exit: -; LOOKUPTABLE-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] -; LOOKUPTABLE-NEXT: ret i64 [[TMP2]] -; -; SWITCH-LABEL: define i64 @bar2( -; SWITCH-SAME: i64 [[TMP0:%.*]]) { -; SWITCH-NEXT: switch i64 [[TMP0]], label 
[[UNREACHABLEDEFAULT:%.*]] [ -; SWITCH-NEXT: i64 0, label [[BRANCH_0:%.*]] -; SWITCH-NEXT: i64 2, label [[BRANCH_2:%.*]] -; SWITCH-NEXT: i64 4, label [[BRANCH_4:%.*]] -; SWITCH-NEXT: i64 6, label [[BRANCH_6:%.*]] -; SWITCH-NEXT: ] -; SWITCH: branch_0: -; SWITCH-NEXT: br label [[EXIT:%.*]] -; SWITCH: branch_2: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_4: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: branch_6: -; SWITCH-NEXT: br label [[EXIT]] -; SWITCH: unreachabledefault: -; SWITCH-NEXT: unreachable -; SWITCH: exit: -; SWITCH-NEXT: [[TMP2:%.*]] = phi i64 [ 5, [[BRANCH_0]] ], [ 9, [[BRANCH_2]] ], [ 2, [[BRANCH_4]] ], [ 7, [[BRANCH_6]] ] -; SWITCH-NEXT: ret i64 [[TMP2]] -; ; CHECK-LABEL: define i64 @bar2( ; CHECK-SAME: i64 [[A:%.*]]) { ; CHECK-NEXT: switch i64 [[A]], label [[UNREACHABLEDEFAULT:%.*]] [ diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll index 4600a6654a362..b1e5fa4f9e1c9 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll @@ -2032,23 +2032,23 @@ define <4 x i64> @avx2_psrlv_q_256_allbig(<4 x i64> %v) { ret <4 x i64> %1 } -; The shift amount is 0 (the undef lane could be 0), so we return the unshifted input. +; The shift amount is 0 (the poison lane could be 0), so we return the unshifted input. -define <2 x i64> @avx2_psrlv_q_128_undef(<2 x i64> %v) { -; CHECK-LABEL: @avx2_psrlv_q_128_undef( +define <2 x i64> @avx2_psrlv_q_128_poison(<2 x i64> %v) { +; CHECK-LABEL: @avx2_psrlv_q_128_poison( ; CHECK-NEXT: ret <2 x i64> [[V:%.*]] ; - %1 = insertelement <2 x i64> , i64 undef, i64 1 + %1 = insertelement <2 x i64> , i64 poison, i64 1 %2 = tail call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %v, <2 x i64> %1) ret <2 x i64> %2 } -define <4 x i64> @avx2_psrlv_q_256_undef(<4 x i64> %v) { -; CHECK-LABEL: @avx2_psrlv_q_256_undef( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i64> [[V:%.*]], +define <4 x i64> @avx2_psrlv_q_256_poison(<4 x i64> %v) { +; CHECK-LABEL: @avx2_psrlv_q_256_poison( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i64> [[V:%.*]], ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; - %1 = insertelement <4 x i64> , i64 undef, i64 0 + %1 = insertelement <4 x i64> , i64 poison, i64 0 %2 = tail call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %v, <4 x i64> %1) ret <4 x i64> %2 } @@ -2435,21 +2435,21 @@ define <4 x i64> @avx2_psllv_q_256_allbig(<4 x i64> %v) { ; The shift amount is 0 (the undef lane could be 0), so we return the unshifted input. 
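; A poison lane in the shift-amount vector imposes no constraint: it may be
; assumed to be zero like the defined lane, so the variable shift folds away.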
-define <2 x i64> @avx2_psllv_q_128_undef(<2 x i64> %v) { -; CHECK-LABEL: @avx2_psllv_q_128_undef( +define <2 x i64> @avx2_psllv_q_128_poison(<2 x i64> %v) { +; CHECK-LABEL: @avx2_psllv_q_128_poison( ; CHECK-NEXT: ret <2 x i64> [[V:%.*]] ; - %1 = insertelement <2 x i64> , i64 undef, i64 1 + %1 = insertelement <2 x i64> , i64 poison, i64 1 %2 = tail call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %v, <2 x i64> %1) ret <2 x i64> %2 } -define <4 x i64> @avx2_psllv_q_256_undef(<4 x i64> %v) { -; CHECK-LABEL: @avx2_psllv_q_256_undef( -; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], +define <4 x i64> @avx2_psllv_q_256_poison(<4 x i64> %v) { +; CHECK-LABEL: @avx2_psllv_q_256_poison( +; CHECK-NEXT: [[TMP1:%.*]] = shl <4 x i64> [[V:%.*]], ; CHECK-NEXT: ret <4 x i64> [[TMP1]] ; - %1 = insertelement <4 x i64> , i64 undef, i64 0 + %1 = insertelement <4 x i64> , i64 poison, i64 0 %2 = tail call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %v, <4 x i64> %1) ret <4 x i64> %2 } diff --git a/llvm/test/Transforms/InstCombine/abs-1.ll b/llvm/test/Transforms/InstCombine/abs-1.ll index 7355c560c820b..32bd7a37053ed 100644 --- a/llvm/test/Transforms/InstCombine/abs-1.ll +++ b/llvm/test/Transforms/InstCombine/abs-1.ll @@ -63,14 +63,14 @@ define <2 x i8> @abs_canonical_2(<2 x i8> %x) { ret <2 x i8> %abs } -; Even if a constant has undef elements. +; Even if a constant has poison elements. -define <2 x i8> @abs_canonical_2_vec_undef_elts(<2 x i8> %x) { -; CHECK-LABEL: @abs_canonical_2_vec_undef_elts( +define <2 x i8> @abs_canonical_2_vec_poison_elts(<2 x i8> %x) { +; CHECK-LABEL: @abs_canonical_2_vec_poison_elts( ; CHECK-NEXT: [[ABS:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; - %cmp = icmp sgt <2 x i8> %x, + %cmp = icmp sgt <2 x i8> %x, %neg = sub <2 x i8> zeroinitializer, %x %abs = select <2 x i1> %cmp, <2 x i8> %x, <2 x i8> %neg ret <2 x i8> %abs @@ -208,15 +208,15 @@ define <2 x i8> @nabs_canonical_2(<2 x i8> %x) { ret <2 x i8> %abs } -; Even if a constant has undef elements. +; Even if a constant has poison elements. 
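; As with abs above, the select returns the negated value here, so the nabs
; pattern canonicalizes to a negated @llvm.abs call.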
-define <2 x i8> @nabs_canonical_2_vec_undef_elts(<2 x i8> %x) { -; CHECK-LABEL: @nabs_canonical_2_vec_undef_elts( +define <2 x i8> @nabs_canonical_2_vec_poison_elts(<2 x i8> %x) { +; CHECK-LABEL: @nabs_canonical_2_vec_poison_elts( ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.abs.v2i8(<2 x i8> [[X:%.*]], i1 false) ; CHECK-NEXT: [[ABS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[ABS]] ; - %cmp = icmp sgt <2 x i8> %x, + %cmp = icmp sgt <2 x i8> %x, %neg = sub <2 x i8> zeroinitializer, %x %abs = select <2 x i1> %cmp, <2 x i8> %neg, <2 x i8> %x ret <2 x i8> %abs diff --git a/llvm/test/Transforms/InstCombine/add-mask-neg.ll b/llvm/test/Transforms/InstCombine/add-mask-neg.ll index 5fad6155d348e..0e579f3097607 100644 --- a/llvm/test/Transforms/InstCombine/add-mask-neg.ll +++ b/llvm/test/Transforms/InstCombine/add-mask-neg.ll @@ -89,8 +89,8 @@ define <2 x i32> @dec_mask_neg_v2i32(<2 x i32> %X) { ret <2 x i32> %dec } -define <2 x i32> @dec_mask_neg_v2i32_undef(<2 x i32> %X) { -; CHECK-LABEL: @dec_mask_neg_v2i32_undef( +define <2 x i32> @dec_mask_neg_v2i32_poison(<2 x i32> %X) { +; CHECK-LABEL: @dec_mask_neg_v2i32_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[X]], ; CHECK-NEXT: [[DEC:%.*]] = and <2 x i32> [[TMP1]], [[TMP2]] @@ -98,7 +98,7 @@ define <2 x i32> @dec_mask_neg_v2i32_undef(<2 x i32> %X) { ; %neg = sub <2 x i32> zeroinitializer, %X %mask = and <2 x i32> %neg, %X - %dec = add <2 x i32> %mask, + %dec = add <2 x i32> %mask, ret <2 x i32> %dec } diff --git a/llvm/test/Transforms/InstCombine/add-sitofp.ll b/llvm/test/Transforms/InstCombine/add-sitofp.ll index 2bdc808d9771c..f1afcaf5f85d2 100644 --- a/llvm/test/Transforms/InstCombine/add-sitofp.ll +++ b/llvm/test/Transforms/InstCombine/add-sitofp.ll @@ -6,7 +6,7 @@ define double @x(i32 %a, i32 %b) { ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[A:%.*]], 24 ; CHECK-NEXT: [[N:%.*]] = and i32 [[M]], [[B:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[N]], 1 -; CHECK-NEXT: [[P:%.*]] = uitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[P:%.*]] = uitofp nneg i32 [[TMP1]] to double ; CHECK-NEXT: ret double [[P]] ; %m = lshr i32 %a, 24 @@ -20,7 +20,7 @@ define double @test(i32 %a) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[A_AND]], 1 -; CHECK-NEXT: [[RES:%.*]] = uitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[RES:%.*]] = uitofp nneg i32 [[TMP1]] to double ; CHECK-NEXT: ret double [[RES]] ; ; Drop two highest bits to guarantee that %a + 1 doesn't overflow @@ -33,7 +33,7 @@ define double @test(i32 %a) { define float @test_neg(i32 %a) { ; CHECK-LABEL: @test_neg( ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 -; CHECK-NEXT: [[A_AND_FP:%.*]] = sitofp i32 [[A_AND]] to float +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg i32 [[A_AND]] to float ; CHECK-NEXT: [[RES:%.*]] = fadd float [[A_AND_FP]], 1.000000e+00 ; CHECK-NEXT: ret float [[RES]] ; @@ -49,7 +49,7 @@ define double @test_2(i32 %a, i32 %b) { ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 ; CHECK-NEXT: [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[A_AND]], [[B_AND]] -; CHECK-NEXT: [[RES:%.*]] = uitofp i32 [[TMP1]] to double +; CHECK-NEXT: [[RES:%.*]] = uitofp nneg i32 [[TMP1]] to double ; CHECK-NEXT: ret double [[RES]] ; ; Drop two highest bits to guarantee that %a + %b doesn't overflow @@ -67,8 +67,8 @@ define float @test_2_neg(i32 %a, i32 %b) { ; CHECK-LABEL: 
@test_2_neg( ; CHECK-NEXT: [[A_AND:%.*]] = and i32 [[A:%.*]], 1073741823 ; CHECK-NEXT: [[B_AND:%.*]] = and i32 [[B:%.*]], 1073741823 -; CHECK-NEXT: [[A_AND_FP:%.*]] = sitofp i32 [[A_AND]] to float -; CHECK-NEXT: [[B_AND_FP:%.*]] = sitofp i32 [[B_AND]] to float +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg i32 [[A_AND]] to float +; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg i32 [[B_AND]] to float ; CHECK-NEXT: [[RES:%.*]] = fadd float [[A_AND_FP]], [[B_AND_FP]] ; CHECK-NEXT: ret float [[RES]] ; @@ -89,7 +89,7 @@ define float @test_3(i32 %a, i32 %b) { ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[A:%.*]], 24 ; CHECK-NEXT: [[N:%.*]] = and i32 [[M]], [[B:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[N]], 1 -; CHECK-NEXT: [[P:%.*]] = uitofp i32 [[TMP1]] to float +; CHECK-NEXT: [[P:%.*]] = uitofp nneg i32 [[TMP1]] to float ; CHECK-NEXT: ret float [[P]] ; %m = lshr i32 %a, 24 @@ -104,7 +104,7 @@ define <4 x double> @test_4(<4 x i32> %a, <4 x i32> %b) { ; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], ; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <4 x i32> [[A_AND]], [[B_AND]] -; CHECK-NEXT: [[RES:%.*]] = uitofp <4 x i32> [[TMP1]] to <4 x double> +; CHECK-NEXT: [[RES:%.*]] = uitofp nneg <4 x i32> [[TMP1]] to <4 x double> ; CHECK-NEXT: ret <4 x double> [[RES]] ; ; Drop two highest bits to guarantee that %a + %b doesn't overflow @@ -122,8 +122,8 @@ define <4 x float> @test_4_neg(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: @test_4_neg( ; CHECK-NEXT: [[A_AND:%.*]] = and <4 x i32> [[A:%.*]], ; CHECK-NEXT: [[B_AND:%.*]] = and <4 x i32> [[B:%.*]], -; CHECK-NEXT: [[A_AND_FP:%.*]] = sitofp <4 x i32> [[A_AND]] to <4 x float> -; CHECK-NEXT: [[B_AND_FP:%.*]] = sitofp <4 x i32> [[B_AND]] to <4 x float> +; CHECK-NEXT: [[A_AND_FP:%.*]] = uitofp nneg <4 x i32> [[A_AND]] to <4 x float> +; CHECK-NEXT: [[B_AND_FP:%.*]] = uitofp nneg <4 x i32> [[B_AND]] to <4 x float> ; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[A_AND_FP]], [[B_AND_FP]] ; CHECK-NEXT: ret <4 x float> [[RES]] ; diff --git a/llvm/test/Transforms/InstCombine/add.ll b/llvm/test/Transforms/InstCombine/add.ll index 408b0c6559b00..39b4ad8055088 100644 --- a/llvm/test/Transforms/InstCombine/add.ll +++ b/llvm/test/Transforms/InstCombine/add.ll @@ -150,24 +150,24 @@ define i32 @test5_add_nsw(i32 %A, i32 %B) { ret i32 %D } -define <2 x i8> @neg_op0_vec_undef_elt(<2 x i8> %a, <2 x i8> %b) { -; CHECK-LABEL: @neg_op0_vec_undef_elt( +define <2 x i8> @neg_op0_vec_poison_elt(<2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: @neg_op0_vec_poison_elt( ; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> [[B:%.*]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %nega = sub <2 x i8> , %a + %nega = sub <2 x i8> , %a %r = add <2 x i8> %nega, %b ret <2 x i8> %r } -define <2 x i8> @neg_neg_vec_undef_elt(<2 x i8> %a, <2 x i8> %b) { -; CHECK-LABEL: @neg_neg_vec_undef_elt( +define <2 x i8> @neg_neg_vec_poison_elt(<2 x i8> %a, <2 x i8> %b) { +; CHECK-LABEL: @neg_neg_vec_poison_elt( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> zeroinitializer, [[TMP1]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %nega = sub <2 x i8> , %a - %negb = sub <2 x i8> , %b + %nega = sub <2 x i8> , %a + %negb = sub <2 x i8> , %b %r = add <2 x i8> %nega, %negb ret <2 x i8> %r } @@ -1196,14 +1196,14 @@ define <2 x i32> @test44_vec_non_matching(<2 x i32> %A) { ret <2 x i32> %C } -define <2 x i32> @test44_vec_undef(<2 x i32> %A) { -; CHECK-LABEL: @test44_vec_undef( -; CHECK-NEXT: [[B:%.*]] = or <2 x i32> [[A:%.*]], -; 
CHECK-NEXT: [[C:%.*]] = add <2 x i32> [[B]],
+define <2 x i32> @test44_vec_poison(<2 x i32> %A) {
+; CHECK-LABEL: @test44_vec_poison(
+; CHECK-NEXT: [[B:%.*]] = or <2 x i32> [[A:%.*]],
+; CHECK-NEXT: [[C:%.*]] = add nsw <2 x i32> [[B]],
; CHECK-NEXT: ret <2 x i32> [[C]]
;
- %B = or <2 x i32> %A,
- %C = add <2 x i32> %B,
+ %B = or <2 x i32> %A,
+ %C = add <2 x i32> %B,
ret <2 x i32> %C
}

@@ -2983,7 +2983,7 @@ define i8 @signum_i8_i8_use3(i8 %x) {
ret i8 %r
}

-; poison/undef is ok to propagate in shift amount
+; poison is ok to propagate in shift amount
; complexity canonicalization guarantees that shift is op0 of add

define <2 x i5> @signum_v2i5_v2i5(<2 x i5> %x) {

diff --git a/llvm/test/Transforms/InstCombine/and-or-icmps.ll b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
index 63b11d0c0bc08..c20f48a985b3e 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmps.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmps.ll
@@ -952,8 +952,8 @@ define i1 @substitute_constant_or_ne_uge_commute_logical(i8 %x, i8 %y) {

; Negative test - not safe to substitute vector constant with undef element

-define <2 x i1> @substitute_constant_or_ne_slt_swap_vec(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec(
+define <2 x i1> @substitute_constant_or_ne_slt_swap_vec_undef(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec_undef(
; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]],
; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i8> [[Y:%.*]], [[X]]
; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[C1]], [[C2]]
@@ -965,14 +965,29 @@ define <2 x i1> @substitute_constant_or_ne_slt_swap_vec(<2 x i8> %x, <2 x i8> %y
ret <2 x i1> %r
}

+; TODO: The poison case would be valid to fold.
+
+define <2 x i1> @substitute_constant_or_ne_slt_swap_vec_poison(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec_poison(
+; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]],
+; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i8> [[Y:%.*]], [[X]]
+; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[C1]], [[C2]]
+; CHECK-NEXT: ret <2 x i1> [[R]]
+;
+ %c1 = icmp ne <2 x i8> %x,
+ %c2 = icmp slt <2 x i8> %y, %x
+ %r = or <2 x i1> %c1, %c2
+ ret <2 x i1> %r
+}
+
define <2 x i1> @substitute_constant_or_ne_slt_swap_vec_logical(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @substitute_constant_or_ne_slt_swap_vec_logical(
-; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]],
+; CHECK-NEXT: [[C1:%.*]] = icmp ne <2 x i8> [[X:%.*]],
; CHECK-NEXT: [[C2:%.*]] = icmp slt <2 x i8> [[Y:%.*]], [[X]]
; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C1]], <2 x i1> , <2 x i1> [[C2]]
; CHECK-NEXT: ret <2 x i1> [[R]]
;
- %c1 = icmp ne <2 x i8> %x,
+ %c1 = icmp ne <2 x i8> %x,
%c2 = icmp slt <2 x i8> %y, %x
%r = select <2 x i1> %c1, <2 x i1> , <2 x i1> %c2
ret <2 x i1> %r
@@ -2497,29 +2512,29 @@ define <2 x i1> @icmp_eq_m1_and_eq_m1(<2 x i8> %x, <2 x i8> %y) {
; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP1]],
; CHECK-NEXT: ret <2 x i1> [[R]]
;
- %rx = icmp eq <2 x i8> %x,
- %ry = icmp eq <2 x i8> %y,
+ %rx = icmp eq <2 x i8> %x,
+ %ry = icmp eq <2 x i8> %y,
%r = and <2 x i1> %rx, %ry
ret <2 x i1> %r
}

-define <2 x i1> @icmp_eq_m1_and_eq_undef_m1(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: @icmp_eq_m1_and_eq_undef_m1(
+define <2 x i1> @icmp_eq_m1_and_eq_poison_m1(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @icmp_eq_m1_and_eq_poison_m1(
; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[R:%.*]] = icmp eq <2 x i8> [[TMP1]],
; CHECK-NEXT: ret <2 x i1> [[R]]
;
- %rx = icmp eq <2 x i8> %x,
- %ry = icmp eq <2 x i8> %y,
+ %rx = icmp eq <2 x i8> %x,
+ %ry = icmp eq <2 x i8> %y,
%r = and <2 x i1> %rx, %ry
ret <2 x i1> %r
}

-define <2 x i1> @icmp_eq_undef_and_eq_m1_m2(<2 x i8> %x, <2 x i8> %y) {
-; CHECK-LABEL: @icmp_eq_undef_and_eq_m1_m2(
-; CHECK-NEXT: ret <2 x i1> zeroinitializer
+define <2 x i1> @icmp_eq_poison_and_eq_m1_m2(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @icmp_eq_poison_and_eq_m1_m2(
+; CHECK-NEXT: ret <2 x i1> poison
;
- %rx = icmp eq <2 x i8> %x,
+ %rx = icmp eq <2 x i8> %x,
%ry = icmp eq <2 x i8> %y,
%r = and <2 x i1> %rx, %ry
ret <2 x i1> %r
@@ -2527,13 +2542,13 @@ define <2 x i1> @icmp_eq_undef_and_eq_m1_m2(<2 x i8> %x, <2 x i8> %y) {

define <2 x i1> @icmp_ne_m1_and_ne_m1_fail(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @icmp_ne_m1_and_ne_m1_fail(
-; CHECK-NEXT: [[RX:%.*]] = icmp ne <2 x i8> [[X:%.*]],
-; CHECK-NEXT: [[RY:%.*]] = icmp ne <2 x i8> [[Y:%.*]],
+; CHECK-NEXT: [[RX:%.*]] = icmp ne <2 x i8> [[X:%.*]],
+; CHECK-NEXT: [[RY:%.*]] = icmp ne <2 x i8> [[Y:%.*]],
; CHECK-NEXT: [[R:%.*]] = and <2 x i1> [[RX]], [[RY]]
; CHECK-NEXT: ret <2 x i1> [[R]]
;
- %rx = icmp ne <2 x i8> %x,
- %ry = icmp ne <2 x i8> %y,
+ %rx = icmp ne <2 x i8> %x,
+ %ry = icmp ne <2 x i8> %y,
%r = and <2 x i1> %rx, %ry
ret <2 x i1> %r
}
@@ -2541,13 +2556,13 @@ define <2 x i1> @icmp_ne_m1_and_ne_m1_fail(<2 x i8> %x, <2 x i8> %y) {

define <2 x i1> @icmp_eq_m1_or_eq_m1_fail(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @icmp_eq_m1_or_eq_m1_fail(
-; CHECK-NEXT: [[RX:%.*]] = icmp eq <2 x i8> [[X:%.*]],
-; CHECK-NEXT: [[RY:%.*]] = icmp eq <2 x i8> [[Y:%.*]],
+; CHECK-NEXT: [[RX:%.*]] = icmp eq <2 x i8> [[X:%.*]],
+; CHECK-NEXT: [[RY:%.*]] = icmp eq <2 x i8> [[Y:%.*]],
; CHECK-NEXT: [[R:%.*]] = or <2 x i1> [[RX]], [[RY]]
; CHECK-NEXT: ret <2 x i1> [[R]]
;
- %rx = icmp eq <2 x i8> %x,
- %ry = icmp eq <2 x i8> %y,
+ %rx = icmp eq <2 x i8> %x,
+ %ry = icmp eq <2 x i8> %y,
%r = or <2 x i1> %rx, %ry
ret <2 x i1> %r
}
@@ -2560,7 +2575,7 @@ define <2 x i1> @icmp_ne_m1_or_ne_m1(<2 x i8> %x, <2 x i8> %y) {
; CHECK-NEXT: ret <2 x i1> [[R]]
;
%rx = icmp ne <2 x i8> %x,
- %ry = icmp ne <2 x i8> %y,
+ %ry = icmp ne <2 x i8> %y,
%r = or <2 x i1> %rx, %ry
ret <2 x i1> %r
}

diff --git a/llvm/test/Transforms/InstCombine/and-xor-or.ll b/llvm/test/Transforms/InstCombine/and-xor-or.ll
index d072dc15cbb2c..b26d6e16c2db2 100644
--- a/llvm/test/Transforms/InstCombine/and-xor-or.ll
+++ b/llvm/test/Transforms/InstCombine/and-xor-or.ll
@@ -843,7 +843,7 @@ define <2 x i6> @not_or_or_not_2i6(<2 x i6> %a0, <2 x i6> %b, <2 x i6> %c) {
;
%a = sdiv <2 x i6> , %a0 ; thwart complexity-based canonicalization
%not1 = xor <2 x i6> %b,
- %not2 = xor <2 x i6> %c,
+ %not2 = xor <2 x i6> %c,
%or1 = or <2 x i6> %a, %not1
%or2 = or <2 x i6> %or1, %not2
ret <2 x i6> %or2
@@ -4018,7 +4018,7 @@ define <2 x i4> @and_orn_xor_commute1(<2 x i4> %a, <2 x i4> %b) {
; CHECK-NEXT: ret <2 x i4> [[R]]
;
%xor = xor <2 x i4> %a, %b
- %nota = xor <2 x i4> %a,
+ %nota = xor <2 x i4> %a,
%or = or <2 x i4> %nota, %b
%r = and <2 x i4> %xor, %or
ret <2 x i4> %r
}

diff --git a/llvm/test/Transforms/InstCombine/and.ll b/llvm/test/Transforms/InstCombine/and.ll
index ffd8c2a06c86e..b5250fc1a7849 100644
--- a/llvm/test/Transforms/InstCombine/and.ll
+++ b/llvm/test/Transforms/InstCombine/and.ll
@@ -752,16 +752,16 @@ define <2 x i64> @test36_uniform(<2 x i32> %X) {
ret <2 x i64> %res
}

-define <2 x i64> @test36_undef(<2 x i32> %X) {
-; CHECK-LABEL: @test36_undef(
+define <2 x i64> @test36_poison(<2 x i32> %X) {
+; CHECK-LABEL: @test36_poison(
; CHECK-NEXT: [[ZEXT:%.*]] = zext <2 x i32> [[X:%.*]] to <2 x i64>
-; CHECK-NEXT: [[ZSUB:%.*]] = add <2 x i64> [[ZEXT]],
-; CHECK-NEXT: [[RES:%.*]] = and <2 x i64> [[ZSUB]],
+; CHECK-NEXT: [[ZSUB:%.*]] = add nuw nsw <2 x i64> [[ZEXT]],
+; CHECK-NEXT: [[RES:%.*]] = and <2 x i64> [[ZSUB]],
; CHECK-NEXT: ret <2 x i64> [[RES]]
;
%zext = zext <2 x i32> %X to <2 x i64>
- %zsub = add <2 x i64> %zext,
- %res = and <2 x i64> %zsub,
+ %zsub = add <2 x i64> %zext,
+ %res = and <2 x i64> %zsub,
ret <2 x i64> %res
}
@@ -1630,16 +1630,16 @@ define <2 x i8> @lowmask_add_splat(<2 x i8> %x, ptr %p) {
ret <2 x i8> %r
}

-define <2 x i8> @lowmask_add_splat_undef(<2 x i8> %x, ptr %p) {
-; CHECK-LABEL: @lowmask_add_splat_undef(
-; CHECK-NEXT: [[A:%.*]] = add <2 x i8> [[X:%.*]],
+define <2 x i8> @lowmask_add_splat_poison(<2 x i8> %x, ptr %p) {
+; CHECK-LABEL: @lowmask_add_splat_poison(
+; CHECK-NEXT: [[A:%.*]] = add <2 x i8> [[X:%.*]],
; CHECK-NEXT: store <2 x i8> [[A]], ptr [[P:%.*]], align 2
-; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[A]],
+; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[X]],
; CHECK-NEXT: ret <2 x i8> [[R]]
;
- %a = add <2 x i8> %x, ; 0xc0
+ %a = add <2 x i8> %x, ; 0xc0
store <2 x i8> %a, ptr %p
- %r = and <2 x i8> %a, ; 0x20
+ %r = and <2 x i8> %a, ; 0x20
ret <2 x i8> %r
}
@@ -1679,14 +1679,14 @@ define <2 x i8> @flip_masked_bit_uniform(<2 x i8> %A) {
ret <2 x i8> %C
}

-define <2 x i8> @flip_masked_bit_undef(<2 x i8> %A) {
-; CHECK-LABEL: @flip_masked_bit_undef(
+define <2 x i8> @flip_masked_bit_poison(<2 x i8> %A) {
+; CHECK-LABEL: @flip_masked_bit_poison(
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[A:%.*]],
-; CHECK-NEXT: [[C:%.*]] = and <2 x i8> [[TMP1]],
+; CHECK-NEXT: [[C:%.*]] = and <2 x i8> [[TMP1]],
; CHECK-NEXT: ret <2 x i8> [[C]]
;
- %B = add <2 x i8> %A,
- %C = and <2 x i8> %B,
+ %B = add <2 x i8> %A,
+ %C = and <2 x i8> %B,
ret <2 x i8> %C
}
@@ -2004,7 +2004,7 @@ define i16 @invert_signbit_splat_mask_use2(i8 %x, i16 %y) {
ret i16 %r
}

-; extra use of sext is ok
+; extra use of sext is ok

define i16 @invert_signbit_splat_mask_use3(i8 %x, i16 %y) {
; CHECK-LABEL: @invert_signbit_splat_mask_use3(
@@ -2120,41 +2120,40 @@ define <3 x i16> @shl_lshr_pow2_const_case1_non_uniform_vec_negative(<3 x i16> %
ret <3 x i16> %r
}

-define <3 x i16> @shl_lshr_pow2_const_case1_undef1_vec(<3 x i16> %x) {
-; CHECK-LABEL: @shl_lshr_pow2_const_case1_undef1_vec(
+define <3 x i16> @shl_lshr_pow2_const_case1_poison1_vec(<3 x i16> %x) {
+; CHECK-LABEL: @shl_lshr_pow2_const_case1_poison1_vec(
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]],
; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer
; CHECK-NEXT: ret <3 x i16> [[R]]
;
- %shl = shl <3 x i16> , %x
+ %shl = shl <3 x i16> , %x
%lshr = lshr <3 x i16> %shl,
%r = and <3 x i16> %lshr,
ret <3 x i16> %r
}

-define <3 x i16> @shl_lshr_pow2_const_case1_undef2_vec(<3 x i16> %x) {
-; CHECK-LABEL: @shl_lshr_pow2_const_case1_undef2_vec(
-; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> , [[X:%.*]]
-; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> [[SHL]],
-; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[LSHR]],
+define <3 x i16> @shl_lshr_pow2_const_case1_poison2_vec(<3 x i16> %x) {
+; CHECK-LABEL: @shl_lshr_pow2_const_case1_poison2_vec(
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]],
+; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer
; CHECK-NEXT: ret <3 x i16> [[R]]
;
%shl = shl <3 x i16> , %x
- %lshr = lshr <3 x i16> %shl,
+ %lshr = lshr <3 x i16> %shl,
%r = and <3 x i16> %lshr,
ret <3 x i16> %r
}

-define <3 x i16> @shl_lshr_pow2_const_case1_undef3_vec(<3 x i16> %x) {
-; CHECK-LABEL: @shl_lshr_pow2_const_case1_undef3_vec(
+define <3 x i16> @shl_lshr_pow2_const_case1_poison3_vec(<3 x i16> %x) {
+; CHECK-LABEL: @shl_lshr_pow2_const_case1_poison3_vec(
; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> , [[X:%.*]]
; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> [[SHL]],
-; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[LSHR]],
+; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[LSHR]],
; CHECK-NEXT: ret <3 x i16> [[R]]
;
%shl = shl <3 x i16> , %x
%lshr = lshr <3 x i16> %shl,
- %r = and <3 x i16> %lshr,
+ %r = and <3 x i16> %lshr,
ret <3 x i16> %r
}
@@ -2417,40 +2416,41 @@ define <3 x i16> @lshr_shl_pow2_const_case1_non_uniform_vec_negative(<3 x i16> %
ret <3 x i16> %r
}

-define <3 x i16> @lshr_shl_pow2_const_case1_undef1_vec(<3 x i16> %x) {
-; CHECK-LABEL: @lshr_shl_pow2_const_case1_undef1_vec(
+define <3 x i16> @lshr_shl_pow2_const_case1_poison1_vec(<3 x i16> %x) {
+; CHECK-LABEL: @lshr_shl_pow2_const_case1_poison1_vec(
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]],
; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer
; CHECK-NEXT: ret <3 x i16> [[R]]
;
- %lshr = lshr <3 x i16> , %x
+ %lshr = lshr <3 x i16> , %x
%shl = shl <3 x i16> %lshr,
%r = and <3 x i16> %shl,
ret <3 x i16> %r
}

-define <3 x i16> @lshr_shl_pow2_const_case1_undef2_vec(<3 x i16> %x) {
-; CHECK-LABEL: @lshr_shl_pow2_const_case1_undef2_vec(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <3 x i16> [[X:%.*]],
-; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP1]], <3 x i16> , <3 x i16> zeroinitializer
+define <3 x i16> @lshr_shl_pow2_const_case1_poison2_vec(<3 x i16> %x) {
+; CHECK-LABEL: @lshr_shl_pow2_const_case1_poison2_vec(
+; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> , [[X:%.*]]
+; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> [[LSHR]],
+; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[SHL]],
; CHECK-NEXT: ret <3 x i16> [[R]]
;
%lshr = lshr <3 x i16> , %x
- %shl = shl <3 x i16> %lshr,
+ %shl = shl <3 x i16> %lshr,
%r = and <3 x i16> %shl,
ret <3 x i16> %r
}

-define <3 x i16> @lshr_shl_pow2_const_case1_undef3_vec(<3 x i16> %x) {
-; CHECK-LABEL: @lshr_shl_pow2_const_case1_undef3_vec(
+define <3 x i16> @lshr_shl_pow2_const_case1_poison3_vec(<3 x i16> %x) {
+; CHECK-LABEL: @lshr_shl_pow2_const_case1_poison3_vec(
; CHECK-NEXT: [[LSHR:%.*]] = lshr <3 x i16> , [[X:%.*]]
; CHECK-NEXT: [[SHL:%.*]] = shl <3 x i16> [[LSHR]],
-; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[SHL]],
+; CHECK-NEXT: [[R:%.*]] = and <3 x i16> [[SHL]],
; CHECK-NEXT: ret <3 x i16> [[R]]
;
%lshr = lshr <3 x i16> , %x
%shl = shl <3 x i16> %lshr,
- %r = and <3 x i16> %shl,
+ %r = and <3 x i16> %shl,
ret <3 x i16> %r
}

diff --git a/llvm/test/Transforms/InstCombine/and2.ll b/llvm/test/Transforms/InstCombine/and2.ll
index 73bdadc86710e..104486e7638f5 100644
--- a/llvm/test/Transforms/InstCombine/and2.ll
+++ b/llvm/test/Transforms/InstCombine/and2.ll
@@ -168,14 +168,14 @@ define <2 x i8> @and1_shl1_is_cmp_eq_0_vec(<2 x i8> %x) {
ret <2 x i8> %and
}

-define <2 x i8> @and1_shl1_is_cmp_eq_0_vec_undef(<2 x i8> %x) {
-; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec_undef(
+define <2 x i8> @and1_shl1_is_cmp_eq_0_vec_poison(<2 x i8> %x) {
+; CHECK-LABEL: @and1_shl1_is_cmp_eq_0_vec_poison(
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], zeroinitializer
; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
; CHECK-NEXT: ret <2 x i8> [[AND]]
;
- %sh = shl <2 x i8> , %x
- %and = and <2 x i8> %sh,
+ %sh = shl <2 x i8> , %x
+ %and = and <2 x i8> %sh,
ret <2 x i8> %and
}

@@ -215,14 +215,13 @@ define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec(<2 x i8> %x) {
ret <2 x i8> %and
}

-define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec_undef(<2 x i8> %x) {
-; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec_undef(
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[X:%.*]], zeroinitializer
-; CHECK-NEXT: [[AND:%.*]] = zext <2 x i1> [[TMP1]] to <2 x i8>
+define <2 x i8> @and1_lshr1_is_cmp_eq_0_vec_poison(<2 x i8> %x) {
+; CHECK-LABEL: @and1_lshr1_is_cmp_eq_0_vec_poison(
+; CHECK-NEXT: [[AND:%.*]] = lshr <2 x i8> , [[X:%.*]]
; CHECK-NEXT: ret <2 x i8> [[AND]]
;
- %sh = lshr <2 x i8> , %x
- %and = and <2 x i8> %sh,
+ %sh = lshr <2 x i8> , %x
+ %and = and <2 x i8> %sh,
ret <2 x i8> %and
}

diff --git a/llvm/test/Transforms/InstCombine/ashr-lshr.ll b/llvm/test/Transforms/InstCombine/ashr-lshr.ll
index 60fa5b2597ba9..ac206dc7999dd 100644
--- a/llvm/test/Transforms/InstCombine/ashr-lshr.ll
+++ b/llvm/test/Transforms/InstCombine/ashr-lshr.ll
@@ -229,24 +229,24 @@ define <2 x i32> @ashr_lshr_inv_nonsplat_vec(<2 x i32> %x, <2 x i32> %y) {
ret <2 x i32> %ret
}

-define <2 x i32> @ashr_lshr_vec_undef(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: @ashr_lshr_vec_undef(
+define <2 x i32> @ashr_lshr_vec_poison(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @ashr_lshr_vec_poison(
; CHECK-NEXT: [[CMP12:%.*]] = ashr <2 x i32> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: ret <2 x i32> [[CMP12]]
;
- %cmp = icmp sgt <2 x i32> %x,
+ %cmp = icmp sgt <2 x i32> %x,
%l = lshr <2 x i32> %x, %y
%r = ashr exact <2 x i32> %x, %y
%ret = select <2 x i1> %cmp, <2 x i32> %l, <2 x i32> %r
ret <2 x i32> %ret
}

-define <2 x i32> @ashr_lshr_vec_undef2(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: @ashr_lshr_vec_undef2(
+define <2 x i32> @ashr_lshr_vec_poison2(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @ashr_lshr_vec_poison2(
; CHECK-NEXT: [[CMP1:%.*]] = ashr exact <2 x i32> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: ret <2 x i32> [[CMP1]]
;
- %cmp = icmp slt <2 x i32> %x,
+ %cmp = icmp slt <2 x i32> %x,
%l = lshr exact <2 x i32> %x, %y
%r = ashr exact <2 x i32> %x, %y
%ret = select <2 x i1> %cmp, <2 x i32> %r, <2 x i32> %l
@@ -498,14 +498,14 @@ define <3 x i42> @lshr_sub_nsw_splat(<3 x i42> %x, <3 x i42> %y) {
ret <3 x i42> %shr
}

-define <3 x i42> @lshr_sub_nsw_splat_undef(<3 x i42> %x, <3 x i42> %y) {
-; CHECK-LABEL: @lshr_sub_nsw_splat_undef(
+define <3 x i42> @lshr_sub_nsw_splat_poison(<3 x i42> %x, <3 x i42> %y) {
+; CHECK-LABEL: @lshr_sub_nsw_splat_poison(
; CHECK-NEXT: [[SUB:%.*]] = sub nsw <3 x i42> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[SHR:%.*]] = lshr <3 x i42> [[SUB]],
+; CHECK-NEXT: [[SHR:%.*]] = lshr <3 x i42> [[SUB]],
; CHECK-NEXT: ret <3 x i42> [[SHR]]
;
%sub = sub nsw <3 x i42> %x, %y
- %shr = lshr <3 x i42> %sub,
+ %shr = lshr <3 x i42> %sub,
ret <3 x i42> %shr
}
@@ -572,14 +572,14 @@ define <3 x i43> @ashr_sub_nsw_splat(<3 x i43> %x, <3 x i43> %y) {
ret <3 x i43> %shr
}

-define <3 x i43> @ashr_sub_nsw_splat_undef(<3 x i43> %x, <3 x i43> %y) {
-; CHECK-LABEL: @ashr_sub_nsw_splat_undef(
+define <3 x i43> @ashr_sub_nsw_splat_poison(<3 x i43> %x, <3 x i43> %y) {
+; CHECK-LABEL: @ashr_sub_nsw_splat_poison(
; CHECK-NEXT: [[SUB:%.*]] = sub nsw <3 x i43> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT: [[SHR:%.*]] = ashr <3 x i43> [[SUB]],
+; CHECK-NEXT: [[SHR:%.*]] = ashr <3 x i43> [[SUB]],
; CHECK-NEXT: ret <3 x i43> [[SHR]]
;
%sub = sub nsw <3 x i43> %x, %y
- %shr = ashr <3 x i43> %sub,
+ %shr = ashr <3 x i43> %sub,
ret <3 x i43> %shr
}
diff --git a/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll b/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll
index 3cf312e426edf..46a7f2f1189e2 100644
--- a/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll
+++ b/llvm/test/Transforms/InstCombine/ashr-or-mul-abs.ll
@@ -62,13 +62,13 @@ define <4 x i32> @ashr_or_mul_to_abs_vec2(<4 x i32> %X) {
ret <4 x i32> %i2
}

-define <4 x i32> @ashr_or_mul_to_abs_vec3_undef(<4 x i32> %X) {
-; CHECK-LABEL: @ashr_or_mul_to_abs_vec3_undef(
+define <4 x i32> @ashr_or_mul_to_abs_vec3_poison(<4 x i32> %X) {
+; CHECK-LABEL: @ashr_or_mul_to_abs_vec3_poison(
; CHECK-NEXT: [[I2:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[X:%.*]], i1 false)
; CHECK-NEXT: ret <4 x i32> [[I2]]
;
- %i = ashr <4 x i32> %X,
- %i1 = or <4 x i32> %i,
+ %i = ashr <4 x i32> %X,
+ %i1 = or <4 x i32> %i,
%i2 = mul <4 x i32> %i1, %X
ret <4 x i32> %i2
}

diff --git a/llvm/test/Transforms/InstCombine/binop-and-shifts.ll b/llvm/test/Transforms/InstCombine/binop-and-shifts.ll
index 148963894b89f..f776dc13bb4e5 100644
--- a/llvm/test/Transforms/InstCombine/binop-and-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/binop-and-shifts.ll
@@ -178,27 +178,27 @@ define <2 x i8> @shl_xor_and(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @shl_xor_and(
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i8> [[Y:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]]
-; CHECK-NEXT: [[BW1:%.*]] = shl <2 x i8> [[TMP2]],
+; CHECK-NEXT: [[BW1:%.*]] = shl <2 x i8> [[TMP2]],
; CHECK-NEXT: ret <2 x i8> [[BW1]]
;
- %shift1 = shl <2 x i8> %x,
- %shift2 = shl <2 x i8> %y,
- %bw2 = xor <2 x i8> %shift2,
+ %shift1 = shl <2 x i8> %x,
+ %shift2 = shl <2 x i8> %y,
+ %bw2 = xor <2 x i8> %shift2,
%bw1 = and <2 x i8> %bw2, %shift1
ret <2 x i8> %bw1
}

define <2 x i8> @shl_xor_and_fail(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @shl_xor_and_fail(
-; CHECK-NEXT: [[SHIFT1:%.*]] = shl <2 x i8> [[X:%.*]],
-; CHECK-NEXT: [[SHIFT2:%.*]] = shl <2 x i8> [[Y:%.*]],
-; CHECK-NEXT: [[BW2:%.*]] = xor <2 x i8> [[SHIFT2]],
+; CHECK-NEXT: [[SHIFT1:%.*]] = shl <2 x i8> [[X:%.*]],
+; CHECK-NEXT: [[SHIFT2:%.*]] = shl <2 x i8> [[Y:%.*]],
+; CHECK-NEXT: [[BW2:%.*]] = xor <2 x i8> [[SHIFT2]],
; CHECK-NEXT: [[BW1:%.*]] = and <2 x i8> [[SHIFT1]], [[BW2]]
; CHECK-NEXT: ret <2 x i8> [[BW1]]
;
- %shift1 = shl <2 x i8> %x,
- %shift2 = shl <2 x i8> %y,
- %bw2 = xor <2 x i8> %shift2,
+ %shift1 = shl <2 x i8> %x,
+ %shift2 = shl <2 x i8> %y,
+ %bw2 = xor <2 x i8> %shift2,
%bw1 = and <2 x i8> %shift1, %bw2
ret <2 x i8> %bw1
}
@@ -321,13 +321,13 @@ define <2 x i8> @lshr_add_and(<2 x i8> %x, <2 x i8> %y) {

define <2 x i8> @lshr_add_or_fail_dif_masks(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: @lshr_add_or_fail_dif_masks(
; CHECK-NEXT: [[SHIFT1:%.*]] = lshr <2 x i8> [[X:%.*]],
-; CHECK-NEXT: [[SHIFT2:%.*]] = lshr <2 x i8> [[Y:%.*]],
-; CHECK-NEXT: [[BW2:%.*]] = add <2 x i8> [[SHIFT2]],
+; CHECK-NEXT: [[SHIFT2:%.*]] = lshr <2 x i8> [[Y:%.*]],
+; CHECK-NEXT: [[BW2:%.*]] = add nsw <2 x i8> [[SHIFT2]],
; CHECK-NEXT: [[BW1:%.*]] = and <2 x i8> [[SHIFT1]], [[BW2]]
; CHECK-NEXT: ret <2 x i8> [[BW1]]
;
%shift1 = lshr <2 x i8> %x,
- %shift2 = lshr <2 x i8> %y,
+ %shift2 = lshr <2 x i8> %y,
%bw2 = add <2 x i8> %shift2,
%bw1 = and <2 x i8> %shift1, %bw2
ret <2 x i8> %bw1
@@ -659,8 +659,8 @@ define <4 x i8> @and_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %s
ret <4 x i8> %and
}

-define <4 x i8> @and_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
-; CHECK-LABEL: @and_ashr_not_vec_undef_1(
+define <4 x i8> @and_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
+; CHECK-LABEL: @and_ashr_not_vec_poison_1(
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i8> [[TMP1]], [[X:%.*]]
; CHECK-NEXT: [[AND:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]]
@@ -668,18 +668,18 @@ define <4 x i8> @and_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh
;
%x.shift = ashr <4 x i8> %x, %shamt
%y.shift = ashr <4 x i8> %y, %shamt
- %y.shift.not = xor <4 x i8> %y.shift,
+ %y.shift.not = xor <4 x i8> %y.shift,
%and = and <4 x i8> %x.shift, %y.shift.not
ret <4 x i8> %and
}

-define <4 x i8> @and_ashr_not_vec_undef_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
-; CHECK-LABEL: @and_ashr_not_vec_undef_2(
-; CHECK-NEXT: ret <4 x i8> zeroinitializer
+define <4 x i8> @and_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
+; CHECK-LABEL: @and_ashr_not_vec_poison_2(
+; CHECK-NEXT: ret <4 x i8> poison
;
%x.shift = ashr <4 x i8> %x, %shamt
%y.shift = ashr <4 x i8> %y, %shamt
- %y.shift.not = xor <4 x i8> %y.shift,
+ %y.shift.not = xor <4 x i8> %y.shift,
%and = and <4 x i8> %x.shift, %y.shift.not
ret <4 x i8> %and
}
@@ -793,8 +793,8 @@ define <4 x i8> @or_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh
ret <4 x i8> %or
}

-define <4 x i8> @or_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
-; CHECK-LABEL: @or_ashr_not_vec_undef_1(
+define <4 x i8> @or_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
+; CHECK-LABEL: @or_ashr_not_vec_poison_1(
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]],
; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i8> [[TMP1]], [[X:%.*]]
; CHECK-NEXT: [[OR:%.*]] = ashr <4 x i8> [[TMP2]], [[SHAMT:%.*]]
@@ -802,18 +802,18 @@ define <4 x i8> @or_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sha
;
%x.shift = ashr <4 x i8> %x, %shamt
%y.shift = ashr <4 x i8> %y, %shamt
- %y.shift.not = xor <4 x i8> %y.shift,
+ %y.shift.not = xor <4 x i8> %y.shift,
%or = or <4 x i8> %x.shift, %y.shift.not
ret <4 x i8> %or
}

-define <4 x i8> @or_ashr_not_vec_undef_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
-; CHECK-LABEL: @or_ashr_not_vec_undef_2(
-; CHECK-NEXT: ret <4 x i8>
+define <4 x i8> @or_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
+; CHECK-LABEL: @or_ashr_not_vec_poison_2(
+; CHECK-NEXT: ret <4 x i8> poison
;
%x.shift = ashr <4 x i8> %x, %shamt
%y.shift = ashr <4 x i8> %y, %shamt
- %y.shift.not = xor <4 x i8> %y.shift,
+ %y.shift.not = xor <4 x i8> %y.shift,
%or = or <4 x i8> %x.shift, %y.shift.not
ret <4 x i8> %or
}
@@ -926,8 +926,8 @@ define <4 x i8> @xor_ashr_not_vec_commuted(<4 x i8> %x, <4 x i8> %y, <4 x i8> %s
ret <4 x i8> %xor
}

-define <4 x i8> @xor_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
-; CHECK-LABEL: @xor_ashr_not_vec_undef_1(
+define <4 x i8> @xor_ashr_not_vec_poison_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
+; CHECK-LABEL: @xor_ashr_not_vec_poison_1(
; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i8> [[Y:%.*]], [[X:%.*]]
; CHECK-NEXT: [[DOTNOT:%.*]] = ashr <4 x i8> [[TMP1]], [[SHAMT:%.*]]
; CHECK-NEXT: [[XOR:%.*]] = xor <4 x i8> [[DOTNOT]],
@@ -935,18 +935,18 @@ define <4 x i8> @xor_ashr_not_vec_undef_1(<4 x i8> %x, <4 x i8> %y, <4 x i8> %sh
;
%x.shift = ashr <4 x i8> %x, %shamt
%y.shift = ashr <4 x i8> %y, %shamt
- %y.shift.not = xor <4 x i8> %y.shift,
+ %y.shift.not = xor <4 x i8> %y.shift,
%xor = xor <4 x i8> %x.shift, %y.shift.not
ret <4 x i8> %xor
}

-define <4 x i8> @xor_ashr_not_vec_undef_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
-; CHECK-LABEL: @xor_ashr_not_vec_undef_2(
-; CHECK-NEXT: ret <4 x i8> undef
+define <4 x i8> @xor_ashr_not_vec_poison_2(<4 x i8> %x, <4 x i8> %y, <4 x i8> %shamt) {
+; CHECK-LABEL: @xor_ashr_not_vec_poison_2(
+; CHECK-NEXT: ret <4 x i8> poison
;
%x.shift = ashr <4 x i8> %x, %shamt
%y.shift = ashr <4 x i8> %y, %shamt
- %y.shift.not = xor <4 x i8> %y.shift,
+ %y.shift.not = xor <4 x i8> %y.shift,
%xor = xor <4 x i8> %x.shift, %y.shift.not
ret <4 x i8> %xor
}

diff --git a/llvm/test/Transforms/InstCombine/binop-itofp.ll b/llvm/test/Transforms/InstCombine/binop-itofp.ll
index d72a54e8babc9..097a8196af80f 100644
--- a/llvm/test/Transforms/InstCombine/binop-itofp.ll
+++ b/llvm/test/Transforms/InstCombine/binop-itofp.ll
@@ -21,7 +21,7 @@ define half @test_ui_ui_i8_add_fail_overflow(i8 noundef %x_in, i8 noundef %y_in)
; CHECK-LABEL: @test_ui_ui_i8_add_fail_overflow(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 127
; CHECK-NEXT: [[Y:%.*]] = and i8 [[Y_IN:%.*]], -127
-; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = uitofp i8 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -49,7 +49,7 @@ define half @test_ui_ui_i8_add_C(i8 noundef %x_in) {

define half @test_ui_ui_i8_add_C_fail_no_repr(i8 noundef %x_in) {
; CHECK-LABEL: @test_ui_ui_i8_add_C_fail_no_repr(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 127
-; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], 0xH57F8
; CHECK-NEXT: ret half [[R]]
;
@@ -62,7 +62,7 @@ define half @test_ui_ui_i8_add_C_fail_no_repr(i8 noundef %x_in) {

define half @test_ui_ui_i8_add_C_fail_overflow(i8 noundef %x_in) {
; CHECK-LABEL: @test_ui_ui_i8_add_C_fail_overflow(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 127
-; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], 0xH5808
; CHECK-NEXT: ret half [[R]]
;
@@ -110,7 +110,7 @@ define half @test_ui_si_i8_add(i8 noundef %x_in, i8 noundef %y_in) {
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63
; CHECK-NEXT: [[Y:%.*]] = and i8 [[Y_IN:%.*]], 63
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i8 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i8 %x_in, 63
@@ -140,7 +140,7 @@ define half @test_ui_si_i8_add_overflow(i8 noundef %x_in, i8 noundef %y_in) {

define half @test_ui_ui_i8_sub_C(i8 noundef %x_in) {
; CHECK-LABEL: @test_ui_ui_i8_sub_C(
; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X_IN:%.*]], 127
-; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = or i8 %x_in, 128
@@ -166,7 +166,7 @@ define half @test_si_si_i8_sub(i8 noundef %x_in, i8 noundef %y_in) {
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63
; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -64
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i8 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = sitofp i8 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i8 %x_in, 63
@@ -181,7 +181,7 @@ define half @test_si_si_i8_sub_fail_overflow(i8 noundef %x_in, i8 noundef %y_in)
; CHECK-LABEL: @test_si_si_i8_sub_fail_overflow(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63
; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -65
-; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = sitofp i8 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fsub half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -198,7 +198,7 @@ define half @test_si_si_i8_sub_C(i8 noundef %x_in) {
; CHECK-LABEL: @test_si_si_i8_sub_C(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 63
; CHECK-NEXT: [[TMP1:%.*]] = or disjoint i8 [[X]], 64
-; CHECK-NEXT: [[R:%.*]] = sitofp i8 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i8 %x_in, 63
@@ -283,7 +283,7 @@ define half @test_ui_ui_i8_mul_C(i8 noundef %x_in) {

define half @test_ui_ui_i8_mul_C_fail_overlow(i8 noundef %x_in) {
; CHECK-LABEL: @test_ui_ui_i8_mul_C_fail_overlow(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 14
-; CHECK-NEXT: [[XF:%.*]] = uitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], 0xH4CC0
; CHECK-NEXT: ret half [[R]]
;
@@ -315,7 +315,7 @@ define half @test_si_si_i8_mul_fail_maybe_zero(i8 noundef %x_in, i8 noundef %y_i
; CHECK-LABEL: @test_si_si_i8_mul_fail_maybe_zero(
; CHECK-NEXT: [[X:%.*]] = and i8 [[X_IN:%.*]], 7
; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -8
-; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = sitofp i8 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -332,7 +332,7 @@ define half @test_si_si_i8_mul_C_fail_no_repr(i8 noundef %x_in) {
; CHECK-LABEL: @test_si_si_i8_mul_C_fail_no_repr(
; CHECK-NEXT: [[XX:%.*]] = and i8 [[X_IN:%.*]], 6
; CHECK-NEXT: [[X:%.*]] = or disjoint i8 [[XX]], 1
-; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], 0xHC780
; CHECK-NEXT: ret half [[R]]
;
@@ -347,7 +347,7 @@ define half @test_si_si_i8_mul_C_fail_overflow(i8 noundef %x_in) {
; CHECK-LABEL: @test_si_si_i8_mul_C_fail_overflow(
; CHECK-NEXT: [[XX:%.*]] = and i8 [[X_IN:%.*]], 6
; CHECK-NEXT: [[X:%.*]] = or disjoint i8 [[XX]], 1
-; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], 0xHCCC0
; CHECK-NEXT: ret half [[R]]
;
@@ -365,7 +365,7 @@ define half @test_ui_si_i8_mul(i8 noundef %x_in, i8 noundef %y_in) {
; CHECK-NEXT: [[YY:%.*]] = and i8 [[Y_IN:%.*]], 7
; CHECK-NEXT: [[Y:%.*]] = add nuw nsw i8 [[YY]], 1
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%xx = and i8 %x_in, 6
@@ -384,7 +384,7 @@ define half @test_ui_si_i8_mul_fail_maybe_zero(i8 noundef %x_in, i8 noundef %y_i
; CHECK-NEXT: [[X:%.*]] = add nuw nsw i8 [[XX]], 1
; CHECK-NEXT: [[Y:%.*]] = and i8 [[Y_IN:%.*]], 7
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i8 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i8 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i8 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%xx = and i8 %x_in, 7
@@ -401,7 +401,7 @@ define half @test_ui_si_i8_mul_fail_signed(i8 noundef %x_in, i8 noundef %y_in) {
; CHECK-NEXT: [[XX:%.*]] = and i8 [[X_IN:%.*]], 7
; CHECK-NEXT: [[X:%.*]] = add nuw nsw i8 [[XX]], 1
; CHECK-NEXT: [[Y:%.*]] = or i8 [[Y_IN:%.*]], -4
-; CHECK-NEXT: [[XF:%.*]] = sitofp i8 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i8 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = uitofp i8 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -420,7 +420,7 @@ define half @test_ui_ui_i16_add(i16 noundef %x_in, i16 noundef %y_in) {
; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2047
; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i16 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i16 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i16 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i16 %x_in, 2047
@@ -435,8 +435,8 @@ define half @test_ui_ui_i16_add_fail_not_promotable(i16 noundef %x_in, i16 nound
; CHECK-LABEL: @test_ui_ui_i16_add_fail_not_promotable(
; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2049
; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047
-; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half
-; CHECK-NEXT: [[YF:%.*]] = uitofp i16 [[Y]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg i16 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
;
@@ -463,7 +463,7 @@ define half @test_ui_ui_i16_add_C(i16 noundef %x_in) {

define half @test_ui_ui_i16_add_C_fail_overflow(i16 noundef %x_in) {
; CHECK-LABEL: @test_ui_ui_i16_add_C_fail_overflow(
; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2047
-; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half
; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], 0xH7BD0
; CHECK-NEXT: ret half [[R]]
;
@@ -541,7 +541,7 @@ define half @test_si_si_i16_sub_fail_no_promotion(i16 noundef %x_in, i16 noundef
; CHECK-LABEL: @test_si_si_i16_sub_fail_no_promotion(
; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 2047
; CHECK-NEXT: [[Y:%.*]] = or i16 [[Y_IN:%.*]], -2049
-; CHECK-NEXT: [[XF:%.*]] = sitofp i16 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = sitofp i16 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fsub half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -575,7 +575,7 @@ define half @test_ui_si_i16_sub_fail_maybe_signed(i16 noundef %x_in, i16 noundef
; CHECK-NEXT: [[X:%.*]] = or i16 [[X_IN:%.*]], -2048
; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 2047
; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half
-; CHECK-NEXT: [[YF:%.*]] = sitofp i16 [[Y]] to half
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg i16 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fsub half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
;
@@ -607,8 +607,8 @@ define half @test_ui_ui_i16_mul_fail_no_promotion(i16 noundef %x_in, i16 noundef
; CHECK-LABEL: @test_ui_ui_i16_mul_fail_no_promotion(
; CHECK-NEXT: [[X:%.*]] = and i16 [[X_IN:%.*]], 4095
; CHECK-NEXT: [[Y:%.*]] = and i16 [[Y_IN:%.*]], 3
-; CHECK-NEXT: [[XF:%.*]] = uitofp i16 [[X]] to half
-; CHECK-NEXT: [[YF:%.*]] = uitofp i16 [[Y]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half
+; CHECK-NEXT: [[YF:%.*]] = uitofp nneg i16 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
;
@@ -643,7 +643,7 @@ define half @test_si_si_i16_mul_fail_overflow(i16 noundef %x_in, i16 noundef %y_
; CHECK-NEXT: [[XX:%.*]] = and i16 [[X_IN:%.*]], 126
; CHECK-NEXT: [[X:%.*]] = or disjoint i16 [[XX]], 1
; CHECK-NEXT: [[Y:%.*]] = or i16 [[Y_IN:%.*]], -257
-; CHECK-NEXT: [[XF:%.*]] = sitofp i16 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i16 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = sitofp i16 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -690,7 +690,7 @@ define half @test_ui_si_i16_mul(i16 noundef %x_in, i16 noundef %y_in) {
; CHECK-NEXT: [[YY:%.*]] = and i16 [[Y_IN:%.*]], 126
; CHECK-NEXT: [[Y:%.*]] = or disjoint i16 [[YY]], 1
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i16 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i16 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i16 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%xx = and i16 %x_in, 126
@@ -723,7 +723,7 @@ define half @test_ui_ui_i12_add_fail_overflow(i12 noundef %x_in, i12 noundef %y_
; CHECK-LABEL: @test_ui_ui_i12_add_fail_overflow(
; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 2047
; CHECK-NEXT: [[Y:%.*]] = and i12 [[Y_IN:%.*]], -2047
-; CHECK-NEXT: [[XF:%.*]] = uitofp i12 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i12 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = uitofp i12 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fadd half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -821,7 +821,7 @@ define half @test_si_si_i12_sub(i12 noundef %x_in, i12 noundef %y_in) {
; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 1023
; CHECK-NEXT: [[Y:%.*]] = or i12 [[Y_IN:%.*]], -1024
; CHECK-NEXT: [[TMP1:%.*]] = sub nsw i12 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = sitofp i12 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i12 %x_in, 1023
@@ -850,7 +850,7 @@ define half @test_ui_ui_i12_mul(i12 noundef %x_in, i12 noundef %y_in) {
; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 31
; CHECK-NEXT: [[Y:%.*]] = and i12 [[Y_IN:%.*]], 63
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i12 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i12 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i12 %x_in, 31
@@ -883,7 +883,7 @@ define half @test_ui_ui_i12_mul_C(i12 noundef %x_in) {
; CHECK-LABEL: @test_ui_ui_i12_mul_C(
; CHECK-NEXT: [[X:%.*]] = shl i12 [[X_IN:%.*]], 6
; CHECK-NEXT: [[TMP1:%.*]] = and i12 [[X]], 1984
-; CHECK-NEXT: [[R:%.*]] = uitofp i12 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = and i12 %x_in, 31
@@ -915,7 +915,7 @@ define half @test_si_si_i12_mul_fail_overflow(i12 noundef %x_in, i12 noundef %y_
; CHECK-NEXT: [[XX:%.*]] = and i12 [[X_IN:%.*]], 30
; CHECK-NEXT: [[X:%.*]] = or disjoint i12 [[XX]], 1
; CHECK-NEXT: [[Y:%.*]] = or i12 [[Y_IN:%.*]], -128
-; CHECK-NEXT: [[XF:%.*]] = sitofp i12 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i12 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = sitofp i12 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -933,7 +933,7 @@ define half @test_si_si_i12_mul_fail_maybe_non_zero(i12 noundef %x_in, i12 nound
; CHECK-LABEL: @test_si_si_i12_mul_fail_maybe_non_zero(
; CHECK-NEXT: [[X:%.*]] = and i12 [[X_IN:%.*]], 30
; CHECK-NEXT: [[Y:%.*]] = or i12 [[Y_IN:%.*]], -128
-; CHECK-NEXT: [[XF:%.*]] = sitofp i12 [[X]] to half
+; CHECK-NEXT: [[XF:%.*]] = uitofp nneg i12 [[X]] to half
; CHECK-NEXT: [[YF:%.*]] = sitofp i12 [[Y]] to half
; CHECK-NEXT: [[R:%.*]] = fmul half [[XF]], [[YF]]
; CHECK-NEXT: ret half [[R]]
@@ -950,7 +950,7 @@ define half @test_si_si_i12_mul_C(i12 noundef %x_in) {
; CHECK-LABEL: @test_si_si_i12_mul_C(
; CHECK-NEXT: [[X:%.*]] = or i12 [[X_IN:%.*]], -64
; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i12 [[X]], -16
-; CHECK-NEXT: [[R:%.*]] = sitofp i12 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%x = or i12 %x_in, -64
@@ -979,7 +979,7 @@ define half @test_ui_si_i12_mul_nsw(i12 noundef %x_in, i12 noundef %y_in) {
; CHECK-NEXT: [[YY:%.*]] = and i12 [[Y_IN:%.*]], 30
; CHECK-NEXT: [[Y:%.*]] = or disjoint i12 [[YY]], 1
; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i12 [[X]], [[Y]]
-; CHECK-NEXT: [[R:%.*]] = uitofp i12 [[TMP1]] to half
+; CHECK-NEXT: [[R:%.*]] = uitofp nneg i12 [[TMP1]] to half
; CHECK-NEXT: ret half [[R]]
;
%xx = and i12 %x_in, 31

diff --git a/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll b/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll
index 27a3c8743368a..a16ad4ddb806f 100644
--- a/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll
+++ b/llvm/test/Transforms/InstCombine/binop-of-displaced-shifts.ll
@@ -202,41 +202,41 @@ define <2 x i8> @shl_or_non_splat(<2 x i8> %x) {
ret <2 x i8> %binop
}

-define <2 x i8> @shl_or_undef_in_add(<2 x i8> %x) {
-; CHECK-LABEL: define <2 x i8> @shl_or_undef_in_add
+define <2 x i8> @shl_or_poison_in_add(<2 x i8> %x) {
+; CHECK-LABEL: define <2 x i8> @shl_or_poison_in_add
; CHECK-SAME: (<2 x i8> [[X:%.*]]) {
; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]]
; CHECK-NEXT: ret <2 x i8> [[BINOP]]
;
%shift = shl <2 x i8> , %x
- %add = add <2 x i8> %x,
+ %add = add <2 x i8> %x,
%shift2 = shl <2 x i8> , %add
%binop = or <2 x i8> %shift, %shift2
ret <2 x i8> %binop
}

-define <2 x i8> @shl_or_undef_in_shift1(<2 x i8> %x) {
-; CHECK-LABEL: define <2 x i8> @shl_or_undef_in_shift1
+define <2 x i8> @shl_or_poison_in_shift1(<2 x i8> %x) {
+; CHECK-LABEL: define <2 x i8> @shl_or_poison_in_shift1
; CHECK-SAME: (<2 x i8> [[X:%.*]]) {
-; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]]
+; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]]
; CHECK-NEXT: ret <2 x i8> [[BINOP]]
;
- %shift = shl <2 x i8> , %x
+ %shift = shl <2 x i8> , %x
%add = add <2 x i8> %x,
%shift2 = shl <2 x i8> , %add
%binop = or <2 x i8> %shift, %shift2
ret <2 x i8> %binop
}

-define <2 x i8> @shl_or_undef_in_shift2(<2 x i8> %x) {
-; CHECK-LABEL: define <2 x i8> @shl_or_undef_in_shift2
+define <2 x i8> @shl_or_poison_in_shift2(<2 x i8> %x) {
+; CHECK-LABEL: define <2 x i8> @shl_or_poison_in_shift2
; CHECK-SAME: (<2 x i8> [[X:%.*]]) {
-; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]]
+; CHECK-NEXT: [[BINOP:%.*]] = shl <2 x i8> , [[X]]
; CHECK-NEXT: ret <2 x i8> [[BINOP]]
;
%shift = shl <2 x i8> , %x
%add = add <2 x i8> %x,
- %shift2 = shl <2 x i8> , %add
+ %shift2 = shl <2 x i8> , %add
%binop = or <2 x i8> %shift, %shift2
ret <2 x i8> %binop
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll b/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll
index 4547008b76093..c555970ea4348 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-clamp-like-pattern-between-zero-and-positive-threshold.ll
@@ -338,22 +338,22 @@ define <2 x i32> @t18_ult_slt_vec_nonsplat(<2 x i32> %x, <2 x i32> %replacement_
ret <2 x i32> %r
}

-define <3 x i32> @t19_ult_slt_vec_undef0(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) {
-; CHECK-LABEL: @t19_ult_slt_vec_undef0(
+define <3 x i32> @t19_ult_slt_vec_poison0(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) {
+; CHECK-LABEL: @t19_ult_slt_vec_poison0(
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i32> [[X:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <3 x i32> [[X]],
; CHECK-NEXT: [[TMP3:%.*]] = select <3 x i1> [[TMP1]], <3 x i32> [[REPLACEMENT_LOW:%.*]], <3 x i32> [[X]]
; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP2]], <3 x i32> [[REPLACEMENT_HIGH:%.*]], <3 x i32> [[TMP3]]
; CHECK-NEXT: ret <3 x i32> [[R]]
;
- %t0 = icmp slt <3 x i32> %x,
+ %t0 = icmp slt <3 x i32> %x,
%t1 = select <3 x i1> %t0, <3 x i32> %replacement_low, <3 x i32> %replacement_high
%t2 = icmp ult <3 x i32> %x,
%r = select <3 x i1> %t2, <3 x i32> %x, <3 x i32> %t1
ret <3 x i32> %r
}

-define <3 x i32> @t20_ult_slt_vec_undef1(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) {
-; CHECK-LABEL: @t20_ult_slt_vec_undef1(
+define <3 x i32> @t20_ult_slt_vec_poison1(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) {
+; CHECK-LABEL: @t20_ult_slt_vec_poison1(
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i32> [[X:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <3 x i32> [[X]],
; CHECK-NEXT: [[TMP3:%.*]] = select <3 x i1> [[TMP1]], <3 x i32> [[REPLACEMENT_LOW:%.*]], <3 x i32> [[X]]
@@ -362,21 +362,21 @@ define <3 x i32> @t20_ult_slt_vec_undef1(<3 x i32> %x, <3 x i32> %replacement_lo
;
%t0 = icmp slt <3 x i32> %x,
%t1 = select <3 x i1> %t0, <3 x i32> %replacement_low, <3 x i32> %replacement_high
- %t2 = icmp ult <3 x i32> %x,
+ %t2 = icmp ult <3 x i32> %x,
%r = select <3 x i1> %t2, <3 x i32> %x, <3 x i32> %t1
ret <3 x i32> %r
}

-define <3 x i32> @t21_ult_slt_vec_undef2(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) {
-; CHECK-LABEL: @t21_ult_slt_vec_undef2(
+define <3 x i32> @t21_ult_slt_vec_poison2(<3 x i32> %x, <3 x i32> %replacement_low, <3 x i32> %replacement_high) {
+; CHECK-LABEL: @t21_ult_slt_vec_poison2(
; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <3 x i32> [[X:%.*]], zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <3 x i32> [[X]],
; CHECK-NEXT: [[TMP3:%.*]] = select <3 x i1> [[TMP1]], <3 x i32> [[REPLACEMENT_LOW:%.*]], <3 x i32> [[X]]
; CHECK-NEXT: [[R:%.*]] = select <3 x i1> [[TMP2]], <3 x i32> [[REPLACEMENT_HIGH:%.*]], <3 x i32> [[TMP3]]
; CHECK-NEXT: ret <3 x i32> [[R]]
;
- %t0 = icmp slt <3 x i32> %x,
+ %t0 = icmp slt <3 x i32> %x,
%t1 = select <3 x i1> %t0, <3 x i32> %replacement_low, <3 x i32> %replacement_high
- %t2 = icmp ult <3 x i32> %x,
+ %t2 = icmp ult <3 x i32> %x,
%r = select <3 x i1> %t2, <3 x i32> %x, <3 x i32> %t1
ret <3 x i32> %r
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll
index 5b7a99d53c308..759770688cf20 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-eq-to-icmp-ule.ll
@@ -79,12 +79,12 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp eq <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll
index 160d968b9ac4c..95e6d5ac6a5f8 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll
@@ -79,22 +79,22 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ne <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_nonsplat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_nonsplat_undef(
+define <3 x i1> @p3_vec_nonsplat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_nonsplat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ne <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll
index 60921042d5243..ae503bfb1cfe2 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll
@@ -58,12 +58,12 @@ define <2 x i1> @p2_vec_nonsplat_edgecase(<2 x i8> %x) {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sge <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}
@@ -175,11 +175,11 @@ define <2 x i1> @n3_vec(<2 x i8> %x) {

define <3 x i1> @n4_vec(<3 x i8> %x) {
; CHECK-LABEL: @n4_vec(
-; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]],
+; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]],
; CHECK-NEXT: [[RET:%.*]] = icmp sge <3 x i8> [[TMP0]], [[X]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sge <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll
index 6345e70d7220e..f1333fed2c517 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll
@@ -72,26 +72,26 @@ define <2 x i1> @p2_vec_nonsplat_edgecase() {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef() {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison() {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sgt <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_nonsplat_undef() {
-; CHECK-LABEL: @p3_vec_nonsplat_undef(
+define <3 x i1> @p3_vec_nonsplat_poison() {
+; CHECK-LABEL: @p3_vec_nonsplat_poison(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sgt <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}
@@ -212,12 +212,12 @@ define <2 x i1> @n3_vec() {

define <3 x i1> @n4_vec() {
; CHECK-LABEL: @n4_vec(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
-; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]],
+; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]],
; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X]], [[TMP0]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sgt <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll
index b7aec53fed676..4bed21a525f05 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll
@@ -72,14 +72,14 @@ define <2 x i1> @p2_vec_nonsplat_edgecase() {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef() {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison() {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[X]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sle <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}
@@ -200,12 +200,12 @@ define <2 x i1> @n3_vec() {

define <3 x i1> @n4_vec() {
; CHECK-LABEL: @n4_vec(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
-; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]],
+; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X]],
; CHECK-NEXT: [[RET:%.*]] = icmp sle <3 x i8> [[X]], [[TMP0]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp sle <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll
index 56661d335c4f6..be6e3d0306bcd 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll
@@ -58,22 +58,22 @@ define <2 x i1> @p2_vec_nonsplat_edgecase(<2 x i8> %x) {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp slt <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_nonsplat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_nonsplat_undef(
+define <3 x i1> @p3_vec_nonsplat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_nonsplat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp sgt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp slt <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}
@@ -185,11 +185,11 @@ define <2 x i1> @n3(<2 x i8> %x) {

define <3 x i1> @n4(<3 x i8> %x) {
; CHECK-LABEL: @n4(
-; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]],
+; CHECK-NEXT: [[TMP0:%.*]] = and <3 x i8> [[X:%.*]],
; CHECK-NEXT: [[RET:%.*]] = icmp slt <3 x i8> [[TMP0]], [[X]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp slt <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll
index a93e8f779435f..cfd48821b2c1d 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-uge-to-icmp-ule.ll
@@ -79,12 +79,12 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp uge <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll
index 73ea4d456d246..6f6ba95a81c76 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ugt-to-icmp-ugt.ll
@@ -95,26 +95,26 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1() {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef() {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison() {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ugt <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_nonsplat_undef() {
-; CHECK-LABEL: @p3_vec_nonsplat_undef(
+define <3 x i1> @p3_vec_nonsplat_poison() {
+; CHECK-LABEL: @p3_vec_nonsplat_poison(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ugt <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll
index 53886b5f2dc9c..54f00321c4cf0 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ule-to-icmp-ule.ll
@@ -95,14 +95,14 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1() {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef() {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison() {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[X:%.*]] = call <3 x i8> @gen3x8()
; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[X]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%x = call <3 x i8> @gen3x8()
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ule <3 x i8> %x, %tmp0
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll
index d66be571008c2..008fc6d2d6eda 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-ult-to-icmp-ugt.ll
@@ -80,22 +80,22 @@ define <2 x i1> @p2_vec_nonsplat_edgecase1(<2 x i8> %x) {
ret <2 x i1> %ret
}

-define <3 x i1> @p3_vec_splat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_splat_undef(
+define <3 x i1> @p3_vec_splat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_splat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ult <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_nonsplat_undef(<3 x i8> %x) {
-; CHECK-LABEL: @p3_vec_nonsplat_undef(
+define <3 x i1> @p3_vec_nonsplat_poison(<3 x i8> %x) {
+; CHECK-LABEL: @p3_vec_nonsplat_poison(
; CHECK-NEXT: [[RET:%.*]] = icmp ugt <3 x i8> [[X:%.*]],
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = and <3 x i8> %x,
+ %tmp0 = and <3 x i8> %x,
%ret = icmp ult <3 x i8> %tmp0, %x
ret <3 x i1> %ret
}

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll
index 38611d8b53a98..dc5658d302d99 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-eq-to-icmp-ule.ll
@@ -40,13 +40,13 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i1> %ret
}

-define <3 x i1> @p2_vec_undef(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p2_vec_undef(
-; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]]
+define <3 x i1> @p2_vec_poison(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p2_vec_poison(
+; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[TMP0]], [[X:%.*]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = lshr <3 x i8> , %y
+ %tmp0 = lshr <3 x i8> , %y
%tmp1 = and <3 x i8> %tmp0, %x
%ret = icmp eq <3 x i8> %tmp1, %x
ret <3 x i1> %ret

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll
index 37d317b695f60..8fbbd2bb9907d 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-and-icmp-ne-to-icmp-ugt.ll
@@ -40,13 +40,13 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i1> %ret
}

-define <3 x i1> @p2_vec_undef(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p2_vec_undef(
-; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]]
+define <3 x i1> @p2_vec_poison(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p2_vec_poison(
+; CHECK-NEXT: [[TMP0:%.*]] = lshr <3 x i8> , [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[TMP0]], [[X:%.*]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %tmp0 = lshr <3 x i8> , %y
+ %tmp0 = lshr <3 x i8> , %y
%tmp1 = and <3 x i8> %tmp0, %x
%ret = icmp ne <3 x i8> %tmp1, %x
ret <3 x i1> %ret

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
index dfd67eae8aafd..88487b38e2c70 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-eq-to-icmp-ule.ll
@@ -44,40 +44,40 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i1> %ret
}

-define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p2_vec_undef0(
+define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p2_vec_poison0(
; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp eq <3 x i8> [[X_HIGHBITS]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %t0 = shl <3 x i8> , %y
+ %t0 = shl <3 x i8> , %y
%t1 = xor <3 x i8> %t0,
%t2 = and <3 x i8> %t1, %x
%ret = icmp eq <3 x i8> %t2, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p3_vec_undef0(
+define <3 x i1> @p3_vec_poison0(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p3_vec_poison0(
; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp eq <3 x i8> [[X_HIGHBITS]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%t0 = shl <3 x i8> , %y
- %t1 = xor <3 x i8> %t0,
+ %t1 = xor <3 x i8> %t0,
%t2 = and <3 x i8> %t1, %x
%ret = icmp eq <3 x i8> %t2, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p4_vec_undef2(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p4_vec_undef2(
+define <3 x i1> @p4_vec_poison2(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p4_vec_poison2(
; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp eq <3 x i8> [[X_HIGHBITS]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %t0 = shl <3 x i8> , %y
- %t1 = xor <3 x i8> %t0,
+ %t0 = shl <3 x i8> , %y
+ %t1 = xor <3 x i8> %t0,
%t2 = and <3 x i8> %t1, %x
%ret = icmp eq <3 x i8> %t2, %x
ret <3 x i1> %ret

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
index 608e133ec7f73..b717925fd644f 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v2-and-icmp-ne-to-icmp-ugt.ll
@@ -44,40 +44,40 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i1> %ret
}

-define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p2_vec_undef0(
+define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p2_vec_poison0(
; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ne <3 x i8> [[X_HIGHBITS]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %t0 = shl <3 x i8> , %y
+ %t0 = shl <3 x i8> , %y
%t1 = xor <3 x i8> %t0,
%t2 = and <3 x i8> %t1, %x
%ret = icmp ne <3 x i8> %t2, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p3_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p3_vec_undef0(
+define <3 x i1> @p3_vec_poison0(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p3_vec_poison0(
; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ne <3 x i8> [[X_HIGHBITS]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
%t0 = shl <3 x i8> , %y
- %t1 = xor <3 x i8> %t0,
+ %t1 = xor <3 x i8> %t0,
%t2 = and <3 x i8> %t1, %x
%ret = icmp ne <3 x i8> %t2, %x
ret <3 x i1> %ret
}

-define <3 x i1> @p4_vec_undef2(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p4_vec_undef2(
+define <3 x i1> @p4_vec_poison2(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p4_vec_poison2(
; CHECK-NEXT: [[X_HIGHBITS:%.*]] = lshr <3 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[RET:%.*]] = icmp ne <3 x i8> [[X_HIGHBITS]], zeroinitializer
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %t0 = shl <3 x i8> , %y
- %t1 = xor <3 x i8> %t0,
+ %t0 = shl <3 x i8> , %y
+ %t1 = xor <3 x i8> %t0,
%t2 = and <3 x i8> %t1, %x
%ret = icmp ne <3 x i8> %t2, %x
ret <3 x i1> %ret

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll
index d13129c1248a4..f48d284e085bc 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-eq-to-icmp-ule.ll
@@ -54,15 +54,15 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i1> %ret
}

-define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p2_vec_undef0(
-; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[Y:%.*]]
+define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p2_vec_poison0(
+; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i8> , [[Y:%.*]]
; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]])
; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> , [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp uge <3 x i8> [[T1]], [[X:%.*]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %t0 = shl <3 x i8> , %y
+ %t0 = shl <3 x i8> , %y
call void @use3i8(<3 x i8> %t0)
%t1 = lshr <3 x i8> %t0, %y
%t2 = and <3 x i8> %t1, %x

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll
index a1517b36d0b9d..f4b3c67164e49 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll
+++ b/llvm/test/Transforms/InstCombine/canonicalize-low-bit-mask-v4-and-icmp-ne-to-icmp-ugt.ll
@@ -54,15 +54,15 @@ define <2 x i1> @p1_vec(<2 x i8> %x, <2 x i8> %y) {
ret <2 x i1> %ret
}

-define <3 x i1> @p2_vec_undef0(<3 x i8> %x, <3 x i8> %y) {
-; CHECK-LABEL: @p2_vec_undef0(
-; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[Y:%.*]]
+define <3 x i1> @p2_vec_poison0(<3 x i8> %x, <3 x i8> %y) {
+; CHECK-LABEL: @p2_vec_poison0(
+; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i8> , [[Y:%.*]]
; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]])
; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i8> , [[Y]]
; CHECK-NEXT: [[RET:%.*]] = icmp ult <3 x i8> [[T1]], [[X:%.*]]
; CHECK-NEXT: ret <3 x i1> [[RET]]
;
- %t0 = shl <3 x i8> , %y
+ %t0 = shl <3 x i8> , %y
call void @use3i8(<3 x i8> %t0)
%t1 = lshr <3 x i8> %t0, %y
%t2 = and <3 x i8> %t1, %x

diff --git a/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll b/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll
index 9b51a7649992f..7b6d07a14a30e 100644
--- a/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll
+++ b/llvm/test/Transforms/InstCombine/cast-int-icmp-eq-0.ll
@@ -603,7 +603,7 @@ define i1 @i16_cast_cmp_sgt_int_m1_sitofp_half(i16 %i) {
ret i1 %cmp
}

-; Verify that vector types and
vector constants including undef elements are transformed too. +; Verify that vector types and vector constants including poison elements are transformed too. define <3 x i1> @i32_cast_cmp_ne_int_0_sitofp_double_vec(<3 x i32> %i) { ; CHECK-LABEL: @i32_cast_cmp_ne_int_0_sitofp_double_vec( @@ -616,38 +616,38 @@ define <3 x i1> @i32_cast_cmp_ne_int_0_sitofp_double_vec(<3 x i32> %i) { ret <3 x i1> %cmp } -; TODO: Can we propagate the constant vector with undef element? +; TODO: Can we propagate the constant vector with poison element? -define <3 x i1> @i32_cast_cmp_eq_int_0_sitofp_float_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_sitofp_float_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_sitofp_float_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_sitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = sitofp <3 x i32> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp eq <3 x i32> %b, + %cmp = icmp eq <3 x i32> %b, ret <3 x i1> %cmp } -define <3 x i1> @i64_cast_cmp_slt_int_1_sitofp_half_vec_undef(<3 x i64> %i) { -; CHECK-LABEL: @i64_cast_cmp_slt_int_1_sitofp_half_vec_undef( +define <3 x i1> @i64_cast_cmp_slt_int_1_sitofp_half_vec_poison(<3 x i64> %i) { +; CHECK-LABEL: @i64_cast_cmp_slt_int_1_sitofp_half_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp slt <3 x i64> [[I:%.*]], ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = sitofp <3 x i64> %i to <3 x half> %b = bitcast <3 x half> %f to <3 x i16> - %cmp = icmp slt <3 x i16> %b, + %cmp = icmp slt <3 x i16> %b, ret <3 x i1> %cmp } -define <3 x i1> @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_undef(<3 x i16> %i) { -; CHECK-LABEL: @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_undef( +define <3 x i1> @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_poison(<3 x i16> %i) { +; CHECK-LABEL: @i16_cast_cmp_sgt_int_m1_sitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <3 x i16> [[I:%.*]], ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = sitofp <3 x i16> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp sgt <3 x i32> %b, + %cmp = icmp sgt <3 x i32> %b, ret <3 x i1> %cmp } diff --git a/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll b/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll index 0752576fad45f..1565fb7c0a6a9 100644 --- a/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll +++ b/llvm/test/Transforms/InstCombine/cast-unsigned-icmp-eqcmp-0.ll @@ -27,14 +27,14 @@ define <2 x i1> @i32_cast_cmp_eq_int_0_uitofp_float_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_float_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_float_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_float_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp eq <3 x i32> %b, + %cmp = icmp eq <3 x i32> %b, ret <3 x i1> %cmp } @@ -60,14 +60,14 @@ define <2 x i1> @i32_cast_cmp_ne_int_0_uitofp_float_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_float_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_float_vec_undef( +define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_float_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: 
@i32_cast_cmp_ne_int_0_uitofp_float_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x float> %b = bitcast <3 x float> %f to <3 x i32> - %cmp = icmp ne <3 x i32> %b, + %cmp = icmp ne <3 x i32> %b, ret <3 x i1> %cmp } @@ -93,14 +93,14 @@ define <2 x i1> @i32_cast_cmp_eq_int_0_uitofp_double_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_double_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_double_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_double_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_double_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x double> %b = bitcast <3 x double> %f to <3 x i64> - %cmp = icmp eq <3 x i64> %b, + %cmp = icmp eq <3 x i64> %b, ret <3 x i1> %cmp } @@ -126,14 +126,14 @@ define <2 x i1> @i32_cast_cmp_ne_int_0_uitofp_double_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_double_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_double_vec_undef( +define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_double_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_double_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x double> %b = bitcast <3 x double> %f to <3 x i64> - %cmp = icmp ne <3 x i64> %b, + %cmp = icmp ne <3 x i64> %b, ret <3 x i1> %cmp } @@ -159,14 +159,14 @@ define <2 x i1> @i32_cast_cmp_eq_int_0_uitofp_half_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_half_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_half_vec_undef( +define <3 x i1> @i32_cast_cmp_eq_int_0_uitofp_half_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_eq_int_0_uitofp_half_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x half> %b = bitcast <3 x half> %f to <3 x i16> - %cmp = icmp eq <3 x i16> %b, + %cmp = icmp eq <3 x i16> %b, ret <3 x i1> %cmp } @@ -192,13 +192,13 @@ define <2 x i1> @i32_cast_cmp_ne_int_0_uitofp_half_vec(<2 x i32> %i) { ret <2 x i1> %cmp } -define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_half_vec_undef(<3 x i32> %i) { -; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_half_vec_undef( +define <3 x i1> @i32_cast_cmp_ne_int_0_uitofp_half_vec_poison(<3 x i32> %i) { +; CHECK-LABEL: @i32_cast_cmp_ne_int_0_uitofp_half_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <3 x i32> [[I:%.*]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[CMP]] ; %f = uitofp <3 x i32> %i to <3 x half> %b = bitcast <3 x half> %f to <3 x i16> - %cmp = icmp ne <3 x i16> %b, + %cmp = icmp ne <3 x i16> %b, ret <3 x i1> %cmp } diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index d9c93ba277295..04a3e8931e62c 100644 --- a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -508,18 +508,16 @@ define <2 x i16> @test40vec_nonuniform(<2 x i16> %a) { ret <2 x i16> %r } -define <2 x i16> @test40vec_undef(<2 x i16> %a) { -; ALL-LABEL: @test40vec_undef( -; ALL-NEXT: [[T:%.*]] = zext <2 x i16> [[A:%.*]] to <2 x i32> -; ALL-NEXT: [[T21:%.*]] = lshr <2 x i32> [[T]], -; ALL-NEXT: 
[[T5:%.*]] = shl <2 x i32> [[T]], -; ALL-NEXT: [[T32:%.*]] = or <2 x i32> [[T21]], [[T5]] -; ALL-NEXT: [[R:%.*]] = trunc <2 x i32> [[T32]] to <2 x i16> +define <2 x i16> @test40vec_poison(<2 x i16> %a) { +; ALL-LABEL: @test40vec_poison( +; ALL-NEXT: [[T21:%.*]] = lshr <2 x i16> [[A:%.*]], +; ALL-NEXT: [[T5:%.*]] = shl <2 x i16> [[A]], +; ALL-NEXT: [[R:%.*]] = or disjoint <2 x i16> [[T21]], [[T5]] ; ALL-NEXT: ret <2 x i16> [[R]] ; %t = zext <2 x i16> %a to <2 x i32> - %t21 = lshr <2 x i32> %t, - %t5 = shl <2 x i32> %t, + %t21 = lshr <2 x i32> %t, + %t5 = shl <2 x i32> %t, %t32 = or <2 x i32> %t21, %t5 %r = trunc <2 x i32> %t32 to <2 x i16> ret <2 x i16> %r @@ -1452,7 +1450,7 @@ define i32 @test89() { ; LE-LABEL: @test89( ; LE-NEXT: ret i32 6 ; - ret i32 bitcast (<2 x i16> to i32) + ret i32 bitcast (<2 x i16> to i32) } define <2 x i32> @test90() { @@ -1462,7 +1460,7 @@ define <2 x i32> @test90() { ; LE-LABEL: @test90( ; LE-NEXT: ret <2 x i32> ; - %t6 = bitcast <4 x half> to <2 x i32> + %t6 = bitcast <4 x half> to <2 x i32> ret <2 x i32> %t6 } @@ -1537,13 +1535,13 @@ define <2 x i8> @trunc_lshr_sext_uniform(<2 x i8> %A) { ret <2 x i8> %D } -define <2 x i8> @trunc_lshr_sext_uniform_undef(<2 x i8> %A) { -; ALL-LABEL: @trunc_lshr_sext_uniform_undef( -; ALL-NEXT: [[D:%.*]] = ashr <2 x i8> [[A:%.*]], +define <2 x i8> @trunc_lshr_sext_uniform_poison(<2 x i8> %A) { +; ALL-LABEL: @trunc_lshr_sext_uniform_poison( +; ALL-NEXT: [[D:%.*]] = ashr <2 x i8> [[A:%.*]], ; ALL-NEXT: ret <2 x i8> [[D]] ; %B = sext <2 x i8> %A to <2 x i32> - %C = lshr <2 x i32> %B, + %C = lshr <2 x i32> %B, %D = trunc <2 x i32> %C to <2 x i8> ret <2 x i8> %D } @@ -1559,13 +1557,13 @@ define <2 x i8> @trunc_lshr_sext_nonuniform(<2 x i8> %A) { ret <2 x i8> %D } -define <3 x i8> @trunc_lshr_sext_nonuniform_undef(<3 x i8> %A) { -; ALL-LABEL: @trunc_lshr_sext_nonuniform_undef( -; ALL-NEXT: [[D:%.*]] = ashr <3 x i8> [[A:%.*]], +define <3 x i8> @trunc_lshr_sext_nonuniform_poison(<3 x i8> %A) { +; ALL-LABEL: @trunc_lshr_sext_nonuniform_poison( +; ALL-NEXT: [[D:%.*]] = ashr <3 x i8> [[A:%.*]], ; ALL-NEXT: ret <3 x i8> [[D]] ; %B = sext <3 x i8> %A to <3 x i32> - %C = lshr <3 x i32> %B, + %C = lshr <3 x i32> %B, %D = trunc <3 x i32> %C to <3 x i8> ret <3 x i8> %D } @@ -2014,15 +2012,13 @@ define <2 x i8> @trunc_lshr_zext_uniform(<2 x i8> %A) { ret <2 x i8> %D } -define <2 x i8> @trunc_lshr_zext_uniform_undef(<2 x i8> %A) { -; ALL-LABEL: @trunc_lshr_zext_uniform_undef( -; ALL-NEXT: [[B:%.*]] = zext <2 x i8> [[A:%.*]] to <2 x i32> -; ALL-NEXT: [[C:%.*]] = lshr <2 x i32> [[B]], -; ALL-NEXT: [[D:%.*]] = trunc nuw <2 x i32> [[C]] to <2 x i8> +define <2 x i8> @trunc_lshr_zext_uniform_poison(<2 x i8> %A) { +; ALL-LABEL: @trunc_lshr_zext_uniform_poison( +; ALL-NEXT: [[D:%.*]] = lshr <2 x i8> [[A:%.*]], ; ALL-NEXT: ret <2 x i8> [[D]] ; %B = zext <2 x i8> %A to <2 x i32> - %C = lshr <2 x i32> %B, + %C = lshr <2 x i32> %B, %D = trunc <2 x i32> %C to <2 x i8> ret <2 x i8> %D } @@ -2038,15 +2034,13 @@ define <2 x i8> @trunc_lshr_zext_nonuniform(<2 x i8> %A) { ret <2 x i8> %D } -define <3 x i8> @trunc_lshr_zext_nonuniform_undef(<3 x i8> %A) { -; ALL-LABEL: @trunc_lshr_zext_nonuniform_undef( -; ALL-NEXT: [[B:%.*]] = zext <3 x i8> [[A:%.*]] to <3 x i32> -; ALL-NEXT: [[C:%.*]] = lshr <3 x i32> [[B]], -; ALL-NEXT: [[D:%.*]] = trunc nuw <3 x i32> [[C]] to <3 x i8> +define <3 x i8> @trunc_lshr_zext_nonuniform_poison(<3 x i8> %A) { +; ALL-LABEL: @trunc_lshr_zext_nonuniform_poison( +; ALL-NEXT: [[D:%.*]] = lshr <3 x i8> [[A:%.*]], ; ALL-NEXT: ret <3 x i8> 
[[D]] ; %B = zext <3 x i8> %A to <3 x i32> - %C = lshr <3 x i32> %B, + %C = lshr <3 x i32> %B, %D = trunc <3 x i32> %C to <3 x i8> ret <3 x i8> %D } diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 9da9eb36d381f..1dd0b17e9f46d 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -472,7 +472,7 @@ define float @ui32_clamp_and_cast_to_float(i32 %x) { ; CHECK-LABEL: @ui32_clamp_and_cast_to_float( ; CHECK-NEXT: [[LO_CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 ; CHECK-NEXT: [[MIN1:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 255) -; CHECK-NEXT: [[MIN:%.*]] = uitofp i32 [[MIN1]] to float +; CHECK-NEXT: [[MIN:%.*]] = uitofp nneg i32 [[MIN1]] to float ; CHECK-NEXT: [[R:%.*]] = select i1 [[LO_CMP]], float 1.000000e+00, float [[MIN]] ; CHECK-NEXT: ret float [[R]] ; @@ -488,7 +488,7 @@ define float @ui64_clamp_and_cast_to_float(i64 %x) { ; CHECK-LABEL: @ui64_clamp_and_cast_to_float( ; CHECK-NEXT: [[LO_CMP:%.*]] = icmp eq i64 [[X:%.*]], 0 ; CHECK-NEXT: [[MIN1:%.*]] = call i64 @llvm.umin.i64(i64 [[X]], i64 255) -; CHECK-NEXT: [[MIN:%.*]] = uitofp i64 [[MIN1]] to float +; CHECK-NEXT: [[MIN:%.*]] = uitofp nneg i64 [[MIN1]] to float ; CHECK-NEXT: [[R:%.*]] = select i1 [[LO_CMP]], float 1.000000e+00, float [[MIN]] ; CHECK-NEXT: ret float [[R]] ; @@ -504,7 +504,7 @@ define float @mixed_clamp_to_float_1(i32 %x) { ; CHECK-LABEL: @mixed_clamp_to_float_1( ; CHECK-NEXT: [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) ; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1) -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %si_min_cmp = icmp sgt i32 %x, 255 @@ -539,7 +539,7 @@ define float @mixed_clamp_to_float_2(i32 %x) { ; CHECK-LABEL: @mixed_clamp_to_float_2( ; CHECK-NEXT: [[SI_MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 255) ; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[SI_MIN]], i32 1) -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %si_min_cmp = icmp sgt i32 %x, 255 @@ -572,7 +572,7 @@ define <2 x float> @mixed_clamp_to_float_vec(<2 x i32> %x) { ; CHECK-LABEL: @mixed_clamp_to_float_vec( ; CHECK-NEXT: [[SI_MIN:%.*]] = call <2 x i32> @llvm.smin.v2i32(<2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: [[R1:%.*]] = call <2 x i32> @llvm.smax.v2i32(<2 x i32> [[SI_MIN]], <2 x i32> ) -; CHECK-NEXT: [[R:%.*]] = sitofp <2 x i32> [[R1]] to <2 x float> +; CHECK-NEXT: [[R:%.*]] = uitofp nneg <2 x i32> [[R1]] to <2 x float> ; CHECK-NEXT: ret <2 x float> [[R]] ; %si_min_cmp = icmp sgt <2 x i32> %x, diff --git a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll index 5d27f374d8921..70868554bdc1b 100644 --- a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll +++ b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll @@ -116,14 +116,14 @@ define <2 x i32> @ctpop3v(<2 x i32> %0) { ret <2 x i32> %5 } -define <2 x i32> @ctpop3v_undef(<2 x i32> %0) { -; CHECK-LABEL: @ctpop3v_undef( +define <2 x i32> @ctpop3v_poison(<2 x i32> %0) { +; CHECK-LABEL: @ctpop3v_poison( ; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP0:%.*]], i1 false), !range [[RNG0]] ; CHECK-NEXT: ret <2 x i32> [[TMP2]] ; %2 = sub <2 x i32> zeroinitializer, %0 %3 = and <2 x i32> %2, %0 - %4 = add <2 x i32> %3, + %4 = add <2 
x i32> %3, %5 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %4) ret <2 x i32> %5 } diff --git a/llvm/test/Transforms/InstCombine/ctpop.ll b/llvm/test/Transforms/InstCombine/ctpop.ll index 27194724b7d83..b3653e5071ba2 100644 --- a/llvm/test/Transforms/InstCombine/ctpop.ll +++ b/llvm/test/Transforms/InstCombine/ctpop.ll @@ -155,28 +155,27 @@ define <2 x i32> @_parity_of_not_vec(<2 x i32> %x) { ret <2 x i32> %r } -define <2 x i32> @_parity_of_not_undef(<2 x i32> %x) { -; CHECK-LABEL: @_parity_of_not_undef( +define <2 x i32> @_parity_of_not_poison(<2 x i32> %x) { +; CHECK-LABEL: @_parity_of_not_poison( ; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]]), !range [[RNG1]] ; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[TMP1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = xor <2 x i32> %x, + %neg = xor <2 x i32> %x, %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %neg) %r = and <2 x i32> %cnt, ret <2 x i32> %r } -define <2 x i32> @_parity_of_not_undef2(<2 x i32> %x) { -; CHECK-LABEL: @_parity_of_not_undef2( -; CHECK-NEXT: [[NEG:%.*]] = xor <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[CNT:%.*]] = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[NEG]]), !range [[RNG1]] -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[CNT]], +define <2 x i32> @_parity_of_not_poison2(<2 x i32> %x) { +; CHECK-LABEL: @_parity_of_not_poison2( +; CHECK-NEXT: [[CNT:%.*]] = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]]), !range [[RNG1]] +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[CNT]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %neg = xor <2 x i32> %x, %cnt = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %neg) - %r = and <2 x i32> %cnt, + %r = and <2 x i32> %cnt, ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/fabs-as-int.ll b/llvm/test/Transforms/InstCombine/fabs-as-int.ll index f32c00e453f22..4e49ff159f875 100644 --- a/llvm/test/Transforms/InstCombine/fabs-as-int.ll +++ b/llvm/test/Transforms/InstCombine/fabs-as-int.ll @@ -137,15 +137,15 @@ define <2 x i32> @not_fabs_as_int_v2f32_nonsplat(<2 x float> %x) { ret <2 x i32> %and } -define <3 x i32> @fabs_as_int_v3f32_undef(<3 x float> %x) { -; CHECK-LABEL: define <3 x i32> @fabs_as_int_v3f32_undef +define <3 x i32> @fabs_as_int_v3f32_poison(<3 x float> %x) { +; CHECK-LABEL: define <3 x i32> @fabs_as_int_v3f32_poison ; CHECK-SAME: (<3 x float> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call <3 x float> @llvm.fabs.v3f32(<3 x float> [[X]]) ; CHECK-NEXT: [[AND:%.*]] = bitcast <3 x float> [[TMP1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[AND]] ; %bc = bitcast <3 x float> %x to <3 x i32> - %and = and <3 x i32> %bc, + %and = and <3 x i32> %bc, ret <3 x i32> %and } diff --git a/llvm/test/Transforms/InstCombine/fabs.ll b/llvm/test/Transforms/InstCombine/fabs.ll index 7e380c2e4590a..5ec65784e7a34 100644 --- a/llvm/test/Transforms/InstCombine/fabs.ll +++ b/llvm/test/Transforms/InstCombine/fabs.ll @@ -321,7 +321,7 @@ define <2 x float> @select_fcmp_nnan_ole_negzero(<2 x float> %x) { ; CHECK-NEXT: ret <2 x float> [[FABS]] ; %lezero = fcmp ole <2 x float> %x, - %negx = fsub nnan <2 x float> , %x + %negx = fsub nnan <2 x float> , %x %fabs = select <2 x i1> %lezero, <2 x float> %negx, <2 x float> %x ret <2 x float> %fabs } @@ -332,7 +332,7 @@ define <2 x float> @select_nnan_fcmp_nnan_ole_negzero(<2 x float> %x) { ; CHECK-NEXT: ret <2 x float> [[FABS]] ; %lezero = fcmp ole <2 x float> %x, - %negx = fsub nnan <2 x float> , %x + %negx = fsub nnan <2 x float> , %x %fabs = select nnan <2 x i1> %lezero, <2 x float> %negx, <2 x float> %x ret <2 x float> 
%fabs } diff --git a/llvm/test/Transforms/InstCombine/fast-math.ll b/llvm/test/Transforms/InstCombine/fast-math.ll index 129d7811cfb86..83f2091244e52 100644 --- a/llvm/test/Transforms/InstCombine/fast-math.ll +++ b/llvm/test/Transforms/InstCombine/fast-math.ll @@ -541,12 +541,12 @@ define float @fneg2(float %x) { ret float %sub } -define <2 x float> @fneg2_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fneg2_vec_undef( +define <2 x float> @fneg2_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fneg2_vec_poison( ; CHECK-NEXT: [[SUB:%.*]] = fneg nsz <2 x float> [[X:%.*]] ; CHECK-NEXT: ret <2 x float> [[SUB]] ; - %sub = fsub nsz <2 x float> , %x + %sub = fsub nsz <2 x float> , %x ret <2 x float> %sub } @@ -562,7 +562,7 @@ define float @fdiv1(float %x) { ; CHECK-NEXT: [[DIV1:%.*]] = fmul fast float [[X:%.*]], 0x3FD7303B60000000 ; CHECK-NEXT: ret float [[DIV1]] ; - %div = fdiv float %x, 0x3FF3333340000000 + %div = fdiv fast float %x, 0x3FF3333340000000 %div1 = fdiv fast float %div, 0x4002666660000000 ret float %div1 ; 0x3FF3333340000000 = 1.2f @@ -603,7 +603,7 @@ define float @fdiv3(float %x) { ; CHECK-NEXT: [[DIV1:%.*]] = fdiv fast float [[TMP1]], 0x47EFFFFFE0000000 ; CHECK-NEXT: ret float [[DIV1]] ; - %div = fdiv float %x, 0x47EFFFFFE0000000 + %div = fdiv fast float %x, 0x47EFFFFFE0000000 %div1 = fdiv fast float %div, 0x4002666660000000 ret float %div1 } diff --git a/llvm/test/Transforms/InstCombine/fcmp-special.ll b/llvm/test/Transforms/InstCombine/fcmp-special.ll index 88bfe930ffdd6..64bc86f4266c7 100644 --- a/llvm/test/Transforms/InstCombine/fcmp-special.ll +++ b/llvm/test/Transforms/InstCombine/fcmp-special.ll @@ -144,21 +144,21 @@ define <2 x i1> @uno_vec_with_nan(<2 x double> %x) { ret <2 x i1> %f } -define <2 x i1> @uno_vec_with_undef(<2 x double> %x) { -; CHECK-LABEL: @uno_vec_with_undef( +define <2 x i1> @uno_vec_with_poison(<2 x double> %x) { +; CHECK-LABEL: @uno_vec_with_poison( ; CHECK-NEXT: [[F:%.*]] = fcmp uno <2 x double> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[F]] ; - %f = fcmp uno <2 x double> %x, + %f = fcmp uno <2 x double> %x, ret <2 x i1> %f } -define <2 x i1> @ord_vec_with_undef(<2 x double> %x) { -; CHECK-LABEL: @ord_vec_with_undef( -; CHECK-NEXT: [[F:%.*]] = fcmp ord <2 x double> [[X:%.*]], +define <2 x i1> @ord_vec_with_poison(<2 x double> %x) { +; CHECK-LABEL: @ord_vec_with_poison( +; CHECK-NEXT: [[F:%.*]] = fcmp ord <2 x double> [[X:%.*]], ; CHECK-NEXT: ret <2 x i1> [[F]] ; - %f = fcmp ord <2 x double> %x, + %f = fcmp ord <2 x double> %x, ret <2 x i1> %f } @@ -224,12 +224,12 @@ define <2 x i1> @negative_zero_olt_vec(<2 x float> %x) { ret <2 x i1> %r } -define <2 x i1> @negative_zero_une_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @negative_zero_une_vec_undef( +define <2 x i1> @negative_zero_une_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @negative_zero_une_vec_poison( ; CHECK-NEXT: [[R:%.*]] = fcmp nnan une <2 x double> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %r = fcmp nnan une <2 x double> %x, + %r = fcmp nnan une <2 x double> %x, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/fcmp.ll b/llvm/test/Transforms/InstCombine/fcmp.ll index f2701d16d0f3d..389264e2f7075 100644 --- a/llvm/test/Transforms/InstCombine/fcmp.ll +++ b/llvm/test/Transforms/InstCombine/fcmp.ll @@ -102,12 +102,12 @@ define <2 x i1> @unary_fneg_constant_swap_pred_vec(<2 x float> %x) { ret <2 x i1> %cmp } -define <2 x i1> @fneg_constant_swap_pred_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fneg_constant_swap_pred_vec_undef( +define <2 x i1> 
@fneg_constant_swap_pred_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fneg_constant_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp olt <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %neg = fsub <2 x float> , %x + %neg = fsub <2 x float> , %x %cmp = fcmp ogt <2 x float> %neg, ret <2 x i1> %cmp } @@ -234,34 +234,34 @@ define <2 x i1> @fneg_unary_fneg_swap_pred_vec(<2 x float> %x, <2 x float> %y) { ret <2 x i1> %cmp } -define <2 x i1> @fneg_fneg_swap_pred_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @fneg_fneg_swap_pred_vec_undef( +define <2 x i1> @fneg_fneg_swap_pred_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fneg_fneg_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %neg1 = fsub <2 x float> , %x - %neg2 = fsub <2 x float> , %y + %neg1 = fsub <2 x float> , %x + %neg2 = fsub <2 x float> , %y %cmp = fcmp olt <2 x float> %neg1, %neg2 ret <2 x i1> %cmp } -define <2 x i1> @unary_fneg_fneg_swap_pred_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @unary_fneg_fneg_swap_pred_vec_undef( +define <2 x i1> @unary_fneg_fneg_swap_pred_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @unary_fneg_fneg_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %neg1 = fneg <2 x float> %x - %neg2 = fsub <2 x float> , %y + %neg2 = fsub <2 x float> , %y %cmp = fcmp olt <2 x float> %neg1, %neg2 ret <2 x i1> %cmp } -define <2 x i1> @fneg_unary_fneg_swap_pred_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @fneg_unary_fneg_swap_pred_vec_undef( +define <2 x i1> @fneg_unary_fneg_swap_pred_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fneg_unary_fneg_swap_pred_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %neg1 = fsub <2 x float> , %x + %neg1 = fsub <2 x float> , %x %neg2 = fneg <2 x float> %y %cmp = fcmp olt <2 x float> %neg1, %neg2 ret <2 x i1> %cmp @@ -1284,3 +1284,205 @@ define <1 x i1> @bitcast_1vec_eq0(i32 %x) { %cmp = fcmp oeq <1 x float> %f, zeroinitializer ret <1 x i1> %cmp } + +; Simplify fcmp (x + 0.0), y => fcmp x, y + +define i1 @fcmp_fadd_zero_ugt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ugt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_uge(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_uge( +; CHECK-NEXT: [[CMP:%.*]] = fcmp uge float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp uge float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ogt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ogt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ogt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_oge(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_oge( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp oge float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ult(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ult( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[ADD:%.*]], [[Y:%.*]] +; 
CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ult float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ule(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ule( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ule float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ule float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_olt(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_olt( +; CHECK-NEXT: [[CMP:%.*]] = fcmp olt float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp olt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ole(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ole( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ole float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ole float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_oeq(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_oeq( +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp oeq float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_one(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_one( +; CHECK-NEXT: [[CMP:%.*]] = fcmp one float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp one float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ueq(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ueq( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ueq float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ueq float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_une(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_une( +; CHECK-NEXT: [[CMP:%.*]] = fcmp une float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp une float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_ord(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_ord( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ord float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp ord float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_uno(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_uno( +; CHECK-NEXT: [[CMP:%.*]] = fcmp uno float [[ADD:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp uno float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_neg_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_neg_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, -0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_zero_switched(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_zero_switched( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ult float [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %y, 0.000000e+00 + %cmp = fcmp ugt float %x, %add + ret i1 %cmp +} + +define <2 x i1> @fcmp_fadd_zero_vec(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fcmp_fadd_zero_vec( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt <2 x float> [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret <2 x i1> [[CMP]] +; + %add = fadd <2 x float> %x, + %cmp = fcmp ugt <2 x float> %add, %y + ret <2 x i1> %cmp +} + +define i1 @fcmp_fast_fadd_fast_zero(float %x, 
float %y) { +; CHECK-LABEL: @fcmp_fast_fadd_fast_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd fast float %x, 0.000000e+00 + %cmp = fcmp fast ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fast_fadd_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fast_fadd_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd float %x, 0.000000e+00 + %cmp = fcmp fast ugt float %add, %y + ret i1 %cmp +} + +define i1 @fcmp_fadd_fast_zero(float %x, float %y) { +; CHECK-LABEL: @fcmp_fadd_fast_zero( +; CHECK-NEXT: [[CMP:%.*]] = fcmp ugt float [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[CMP]] +; + %add = fadd fast float %x, 0.000000e+00 + %cmp = fcmp ugt float %add, %y + ret i1 %cmp +} diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index a0710c2bb0484..ca11685c98417 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -141,12 +141,12 @@ define <2 x float> @not_exact_inverse_vec_arcp(<2 x float> %x) { ret <2 x float> %div } -define <2 x float> @not_exact_inverse_vec_arcp_with_undef_elt(<2 x float> %x) { -; CHECK-LABEL: @not_exact_inverse_vec_arcp_with_undef_elt( -; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp <2 x float> [[X:%.*]], +define <2 x float> @not_exact_inverse_vec_arcp_with_poison_elt(<2 x float> %x) { +; CHECK-LABEL: @not_exact_inverse_vec_arcp_with_poison_elt( +; CHECK-NEXT: [[DIV:%.*]] = fdiv arcp <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %div = fdiv arcp <2 x float> %x, + %div = fdiv arcp <2 x float> %x, ret <2 x float> %div } @@ -333,13 +333,13 @@ define <2 x float> @unary_fneg_fneg_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %div } -define <2 x float> @fneg_fneg_vec_undef_elts(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @fneg_fneg_vec_undef_elts( +define <2 x float> @fneg_fneg_vec_poison_elts(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @fneg_fneg_vec_poison_elts( ; CHECK-NEXT: [[DIV:%.*]] = fdiv <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %xneg = fsub <2 x float> , %x - %yneg = fsub <2 x float> , %y + %xneg = fsub <2 x float> , %x + %yneg = fsub <2 x float> , %y %div = fdiv <2 x float> %xneg, %yneg ret <2 x float> %div } @@ -404,12 +404,12 @@ define <2 x float> @unary_fneg_dividend_constant_divisor_vec(<2 x float> %x) { ret <2 x float> %div } -define <2 x float> @fneg_dividend_constant_divisor_vec_undef_elt(<2 x float> %x) { -; CHECK-LABEL: @fneg_dividend_constant_divisor_vec_undef_elt( +define <2 x float> @fneg_dividend_constant_divisor_vec_poison_elt(<2 x float> %x) { +; CHECK-LABEL: @fneg_dividend_constant_divisor_vec_poison_elt( ; CHECK-NEXT: [[DIV:%.*]] = fdiv ninf <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %neg = fsub <2 x float> , %x + %neg = fsub <2 x float> , %x %div = fdiv ninf <2 x float> %neg, ret <2 x float> %div } diff --git a/llvm/test/Transforms/InstCombine/fma.ll b/llvm/test/Transforms/InstCombine/fma.ll index 8b413ae6f664b..cf3d7f3c525a5 100644 --- a/llvm/test/Transforms/InstCombine/fma.ll +++ b/llvm/test/Transforms/InstCombine/fma.ll @@ -60,13 +60,13 @@ define <2 x float> @fma_unary_fneg_x_unary_fneg_y_vec(<2 x float> %x, <2 x float ret <2 x float> %fma } -define <2 x float> @fma_fneg_x_fneg_y_vec_undef(<2 x float> %x, <2 x float> %y, <2 x float> %z) { -; CHECK-LABEL: @fma_fneg_x_fneg_y_vec_undef( +define <2 x float> 
@fma_fneg_x_fneg_y_vec_poison(<2 x float> %x, <2 x float> %y, <2 x float> %z) { +; CHECK-LABEL: @fma_fneg_x_fneg_y_vec_poison( ; CHECK-NEXT: [[FMA:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]], <2 x float> [[Z:%.*]]) ; CHECK-NEXT: ret <2 x float> [[FMA]] ; - %xn = fsub <2 x float> , %x - %yn = fsub <2 x float> , %y + %xn = fsub <2 x float> , %x + %yn = fsub <2 x float> , %y %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %xn, <2 x float> %yn, <2 x float> %z) ret <2 x float> %fma } diff --git a/llvm/test/Transforms/InstCombine/fmul-pow.ll b/llvm/test/Transforms/InstCombine/fmul-pow.ll index 63458e136074c..84592d220d62c 100644 --- a/llvm/test/Transforms/InstCombine/fmul-pow.ll +++ b/llvm/test/Transforms/InstCombine/fmul-pow.ll @@ -85,8 +85,8 @@ define double @pow_ab_recip_a_reassoc(double %a, double %b) { ; CHECK-NEXT: [[M:%.*]] = call reassoc double @llvm.pow.f64(double [[A:%.*]], double [[TMP1]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p ret double %m } @@ -99,8 +99,8 @@ define double @pow_ab_recip_a_reassoc_commute(double %a, double %b) { ; CHECK-NEXT: [[M:%.*]] = call reassoc double @llvm.pow.f64(double [[A:%.*]], double [[TMP1]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %p, %r ret double %m } @@ -109,14 +109,14 @@ define double @pow_ab_recip_a_reassoc_commute(double %a, double %b) { define double @pow_ab_recip_a_reassoc_use1(double %a, double %b) { ; CHECK-LABEL: @pow_ab_recip_a_reassoc_use1( -; CHECK-NEXT: [[R:%.*]] = fdiv double 1.000000e+00, [[A:%.*]] -; CHECK-NEXT: [[P:%.*]] = call double @llvm.pow.f64(double [[A]], double [[B:%.*]]) +; CHECK-NEXT: [[R:%.*]] = fdiv reassoc double 1.000000e+00, [[A:%.*]] +; CHECK-NEXT: [[P:%.*]] = call reassoc double @llvm.pow.f64(double [[A]], double [[B:%.*]]) ; CHECK-NEXT: [[M:%.*]] = fmul reassoc double [[R]], [[P]] ; CHECK-NEXT: call void @use(double [[R]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p call void @use(double %r) ret double %m @@ -126,13 +126,13 @@ define double @pow_ab_recip_a_reassoc_use1(double %a, double %b) { define double @pow_ab_recip_a_reassoc_use2(double %a, double %b) { ; CHECK-LABEL: @pow_ab_recip_a_reassoc_use2( -; CHECK-NEXT: [[P:%.*]] = call double @llvm.pow.f64(double [[A:%.*]], double [[B:%.*]]) +; CHECK-NEXT: [[P:%.*]] = call reassoc double @llvm.pow.f64(double [[A:%.*]], double [[B:%.*]]) ; CHECK-NEXT: [[M:%.*]] = fdiv reassoc double [[P]], [[A]] ; CHECK-NEXT: call void @use(double [[P]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p call void @use(double %p) ret double %m @@ -142,15 +142,15 @@ define double @pow_ab_recip_a_reassoc_use2(double %a, double %b) { define double @pow_ab_recip_a_reassoc_use3(double %a, double %b) { ; CHECK-LABEL: @pow_ab_recip_a_reassoc_use3( -; CHECK-NEXT: 
[[R:%.*]] = fdiv double 1.000000e+00, [[A:%.*]] -; CHECK-NEXT: [[P:%.*]] = call double @llvm.pow.f64(double [[A]], double [[B:%.*]]) +; CHECK-NEXT: [[R:%.*]] = fdiv reassoc double 1.000000e+00, [[A:%.*]] +; CHECK-NEXT: [[P:%.*]] = call reassoc double @llvm.pow.f64(double [[A]], double [[B:%.*]]) ; CHECK-NEXT: [[M:%.*]] = fmul reassoc double [[R]], [[P]] ; CHECK-NEXT: call void @use(double [[R]]) ; CHECK-NEXT: call void @use(double [[P]]) ; CHECK-NEXT: ret double [[M]] ; - %r = fdiv double 1.0, %a - %p = call double @llvm.pow.f64(double %a, double %b) + %r = fdiv reassoc double 1.0, %a + %p = call reassoc double @llvm.pow.f64(double %a, double %b) %m = fmul reassoc double %r, %p call void @use(double %r) call void @use(double %p) diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index f6435f0032891..e9c86a1270493 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -42,12 +42,12 @@ define <2 x float> @unary_neg_constant_vec(<2 x float> %x) { ret <2 x float> %mul } -define <2 x float> @neg_constant_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @neg_constant_vec_undef( +define <2 x float> @neg_constant_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @neg_constant_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul ninf <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub = fsub <2 x float> , %x + %sub = fsub <2 x float> , %x %mul = fmul ninf <2 x float> %sub, ret <2 x float> %mul } @@ -162,34 +162,34 @@ define <2 x float> @neg_unary_neg_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %mul } -define <2 x float> @neg_neg_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_neg_vec_undef( +define <2 x float> @neg_neg_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_neg_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub1 = fsub <2 x float> , %x - %sub2 = fsub <2 x float> , %y + %sub1 = fsub <2 x float> , %x + %sub2 = fsub <2 x float> , %y %mul = fmul arcp <2 x float> %sub1, %sub2 ret <2 x float> %mul } -define <2 x float> @unary_neg_neg_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @unary_neg_neg_vec_undef( +define <2 x float> @unary_neg_neg_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @unary_neg_neg_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %neg = fneg <2 x float> %x - %sub = fsub <2 x float> , %y + %sub = fsub <2 x float> , %y %mul = fmul arcp <2 x float> %neg, %sub ret <2 x float> %mul } -define <2 x float> @neg_unary_neg_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_unary_neg_vec_undef( +define <2 x float> @neg_unary_neg_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_unary_neg_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = fmul arcp <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub = fsub <2 x float> , %x + %sub = fsub <2 x float> , %x %neg = fneg <2 x float> %y %mul = fmul arcp <2 x float> %sub, %neg ret <2 x float> %mul @@ -322,13 +322,13 @@ define <2 x float> @unary_neg_mul_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %mul } -define <2 x float> @neg_mul_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_mul_vec_undef( +define <2 x float> @neg_mul_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_mul_vec_poison( ; CHECK-NEXT: [[SUB:%.*]] = fneg <2 x float> [[X:%.*]] ; 
CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[SUB]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %sub = fsub <2 x float> , %x + %sub = fsub <2 x float> , %x %mul = fmul <2 x float> %sub, %y ret <2 x float> %mul } @@ -388,9 +388,9 @@ define void @test8(ptr %inout, i1 %c1) { entry: %0 = load i32, ptr %inout, align 4 %conv = uitofp i32 %0 to float - %vecinit = insertelement <4 x float> , float %conv, i32 3 + %vecinit = insertelement <4 x float> , float %conv, i32 3 %sub = fsub <4 x float> , %vecinit - %1 = shufflevector <4 x float> %sub, <4 x float> undef, <4 x i32> + %1 = shufflevector <4 x float> %sub, <4 x float> poison, <4 x i32> %mul = fmul <4 x float> zeroinitializer, %1 br label %for.cond @@ -633,15 +633,15 @@ define float @log2half(float %x, float %y) { define float @log2half_commute(float %x1, float %y) { ; CHECK-LABEL: @log2half_commute( +; CHECK-NEXT: [[X1:%.*]] = fmul fast float [[X2:%.*]], 0x3FC24924A0000000 ; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.log2.f32(float [[Y:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[TMP1]], [[X1:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast float [[TMP1]], [[X1]] ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast float [[TMP2]], [[X1]] -; CHECK-NEXT: [[MUL:%.*]] = fmul fast float [[TMP3]], 0x3FC24924A0000000 -; CHECK-NEXT: ret float [[MUL]] +; CHECK-NEXT: ret float [[TMP3]] ; - %x = fdiv float %x1, 7.0 ; thwart complexity-based canonicalization - %halfy = fmul float %y, 0.5 - %log2 = call float @llvm.log2.f32(float %halfy) + %x = fdiv fast float %x1, 7.0 ; thwart complexity-based canonicalization + %halfy = fmul fast float %y, 0.5 + %log2 = call fast float @llvm.log2.f32(float %halfy) %mul = fmul fast float %x, %log2 ret float %mul } @@ -652,12 +652,50 @@ define float @fdiv_constant_numerator_fmul(float %x) { ; CHECK-LABEL: @fdiv_constant_numerator_fmul( ; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]] ; CHECK-NEXT: ret float [[T3]] +; + %t1 = fdiv reassoc float 2.0e+3, %x + %t3 = fmul reassoc float %t1, 6.0e+3 + ret float %t3 +} + +; C1/X * C2 => (C1*C2) / X with mixed fast-math flags + +define float @fdiv_constant_numerator_fmul_mixed(float %x) { +; CHECK-LABEL: @fdiv_constant_numerator_fmul_mixed( +; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float 1.200000e+07, [[X:%.*]] +; CHECK-NEXT: ret float [[T3]] +; + %t1 = fdiv reassoc float 2.0e+3, %x + %t3 = fmul fast float %t1, 6.0e+3 + ret float %t3 +} + +; C1/X * C2 => (C1*C2) / X with full fast-math flags + +define float @fdiv_constant_numerator_fmul_fast(float %x) { +; CHECK-LABEL: @fdiv_constant_numerator_fmul_fast( +; CHECK-NEXT: [[T3:%.*]] = fdiv fast float 1.200000e+07, [[X:%.*]] +; CHECK-NEXT: ret float [[T3]] +; + %t1 = fdiv fast float 2.0e+3, %x + %t3 = fmul fast float %t1, 6.0e+3 + ret float %t3 +} + +; C1/X * C2 => (C1*C2) / X with no fast-math flags on the fdiv + +define float @fdiv_constant_numerator_fmul_precdiv(float %x) { +; CHECK-LABEL: @fdiv_constant_numerator_fmul_precdiv( +; CHECK-NEXT: [[T1:%.*]] = fdiv float 2.000000e+03, [[X:%.*]] +; CHECK-NEXT: [[T4:%.*]] = fmul reassoc float [[T1]], 6.000000e+03 +; CHECK-NEXT: ret float [[T4]] ; %t1 = fdiv float 2.0e+3, %x %t3 = fmul reassoc float %t1, 6.0e+3 ret float %t3 } + ; C1/X * C2 => (C1*C2) / X is disabled if C1/X has multiple uses @fmul2_external = external global float @@ -682,7 +720,7 @@ define float @fdiv_constant_denominator_fmul(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fmul reassoc float [[X:%.*]], 3.000000e+00 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fdiv float %x, 2.0e+3 + %t1 = fdiv reassoc float 
%x, 2.0e+3 %t3 = fmul reassoc float %t1, 6.0e+3 ret float %t3 } @@ -692,7 +730,7 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec(<4 x float> %x) { ; CHECK-NEXT: [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], ; CHECK-NEXT: ret <4 x float> [[T3]] ; - %t1 = fdiv <4 x float> %x, + %t1 = fdiv reassoc <4 x float> %x, %t3 = fmul reassoc <4 x float> %t1, ret <4 x float> %t3 } @@ -704,8 +742,8 @@ define <4 x float> @fdiv_constant_denominator_fmul_vec_constexpr(<4 x float> %x) ; CHECK-NEXT: [[T3:%.*]] = fmul reassoc <4 x float> [[X:%.*]], ; CHECK-NEXT: ret <4 x float> [[T3]] ; - %constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> to i160) to i128) to <4 x float> - %t1 = fdiv <4 x float> %x, + %constExprMul = bitcast i128 trunc (i160 bitcast (<5 x float> to i160) to i128) to <4 x float> + %t1 = fdiv reassoc <4 x float> %x, %t3 = fmul reassoc <4 x float> %t1, %constExprMul ret <4 x float> %t3 } @@ -734,7 +772,7 @@ define float @fdiv_constant_denominator_fmul_denorm(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fmul fast float [[X:%.*]], 0x3760620000000000 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fdiv float %x, 2.0e+3 + %t1 = fdiv fast float %x, 2.0e+3 %t3 = fmul fast float %t1, 0x3810000000000000 ret float %t3 } @@ -748,7 +786,7 @@ define float @fdiv_constant_denominator_fmul_denorm_try_harder(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fdiv reassoc float [[X:%.*]], 0x47E8000000000000 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fdiv float %x, 3.0 + %t1 = fdiv reassoc float %x, 3.0 %t3 = fmul reassoc float %t1, 0x3810000000000000 ret float %t3 } @@ -776,7 +814,7 @@ define float @fmul_fadd_distribute(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc float [[TMP1]], 6.000000e+00 ; CHECK-NEXT: ret float [[T3]] ; - %t2 = fadd float %x, 2.0 + %t2 = fadd reassoc float %x, 2.0 %t3 = fmul reassoc float %t2, 3.0 ret float %t3 } @@ -787,7 +825,7 @@ define <2 x float> @fmul_fadd_distribute_vec(<2 x float> %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc <2 x float> [[TMP1]], ; CHECK-NEXT: ret <2 x float> [[T3]] ; - %t1 = fadd <2 x float> , %x + %t1 = fadd reassoc <2 x float> , %x %t3 = fmul reassoc <2 x float> %t1, ret <2 x float> %t3 } @@ -798,7 +836,7 @@ define @fmul_fadd_distribute_scalablevec( [[TMP1]], shufflevector ( insertelement ( poison, float 1.200000e+07, i64 0), poison, zeroinitializer) ; CHECK-NEXT: ret [[T3]] ; - %t1 = fadd splat (float 2.0e+3), %x + %t1 = fadd reassoc splat (float 2.0e+3), %x %t3 = fmul reassoc %t1, splat (float 6.0e+3) @@ -813,7 +851,7 @@ define float @fmul_fsub_distribute1(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc float [[TMP1]], -6.000000e+00 ; CHECK-NEXT: ret float [[T3]] ; - %t2 = fsub float %x, 2.0 + %t2 = fsub reassoc float %x, 2.0 %t3 = fmul reassoc float %t2, 3.0 ret float %t3 } @@ -826,7 +864,7 @@ define float @fmul_fsub_distribute2(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fsub reassoc float 6.000000e+00, [[TMP1]] ; CHECK-NEXT: ret float [[T3]] ; - %t2 = fsub float 2.0, %x + %t2 = fsub reassoc float 2.0, %x %t3 = fmul reassoc float %t2, 3.0 ret float %t3 } @@ -840,8 +878,8 @@ define float @fmul_fadd_fmul_distribute(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd fast float [[TMP1]], 1.000000e+01 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fmul float %x, 6.0 - %t2 = fadd float %t1, 2.0 + %t1 = fmul fast float %x, 6.0 + %t2 = fadd fast float %t1, 2.0 %t3 = fmul fast float %t2, 5.0 ret float %t3 } @@ -872,8 +910,8 @@ define double @fmul_fadd_fdiv_distribute2(double %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000 ; CHECK-NEXT: ret double 
[[T3]] ; - %t1 = fdiv double %x, 3.0 - %t2 = fadd double %t1, 5.0 + %t1 = fdiv reassoc double %x, 3.0 + %t2 = fadd reassoc double %t1, 5.0 %t3 = fmul reassoc double %t2, 0x10000000000000 ret double %t3 } @@ -887,8 +925,8 @@ define double @fmul_fadd_fdiv_distribute3(double %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd reassoc double [[TMP1]], 0x34000000000000 ; CHECK-NEXT: ret double [[T3]] ; - %t1 = fdiv double %x, 3.0 - %t2 = fadd double %t1, 5.0 + %t1 = fdiv reassoc double %x, 3.0 + %t2 = fadd reassoc double %t1, 5.0 %t3 = fmul reassoc double %t2, 0x10000000000000 ret double %t3 } @@ -902,8 +940,8 @@ define float @fmul_fsub_fmul_distribute(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fsub fast float 1.000000e+01, [[TMP1]] ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fmul float %x, 6.0 - %t2 = fsub float 2.0, %t1 + %t1 = fmul fast float %x, 6.0 + %t2 = fsub fast float 2.0, %t1 %t3 = fmul fast float %t2, 5.0 ret float %t3 } @@ -932,8 +970,8 @@ define float @fmul_fsub_fmul_distribute2(float %x) { ; CHECK-NEXT: [[T3:%.*]] = fadd fast float [[TMP1]], -1.000000e+01 ; CHECK-NEXT: ret float [[T3]] ; - %t1 = fmul float %x, 6.0 - %t2 = fsub float %t1, 2.0 + %t1 = fmul fast float %x, 6.0 + %t2 = fsub fast float %t1, 2.0 %t3 = fmul fast float %t2, 5.0 ret float %t3 } @@ -986,8 +1024,8 @@ define double @fmul_fdivs_factor_common_denominator(double %x, double %y, double ; CHECK-NEXT: [[MUL:%.*]] = fdiv fast double [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret double [[MUL]] ; - %div1 = fdiv double %x, %z - %div2 = fdiv double %y, %z + %div1 = fdiv fast double %x, %z + %div2 = fdiv fast double %y, %z %mul = fmul fast double %div1, %div2 ret double %mul } @@ -999,8 +1037,8 @@ define double @fmul_fdivs_factor(double %x, double %y, double %z, double %w) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc double [[TMP2]], [[Y:%.*]] ; CHECK-NEXT: ret double [[MUL]] ; - %div1 = fdiv double %x, %y - %div2 = fdiv double %z, %w + %div1 = fdiv reassoc double %x, %y + %div2 = fdiv reassoc double %z, %w %mul = fmul reassoc double %div1, %div2 ret double %mul } @@ -1011,7 +1049,7 @@ define double @fmul_fdiv_factor(double %x, double %y, double %z) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc double [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret double [[MUL]] ; - %div = fdiv double %x, %y + %div = fdiv reassoc double %x, %y %mul = fmul reassoc double %div, %z ret double %mul } @@ -1022,7 +1060,7 @@ define double @fmul_fdiv_factor_constant1(double %x, double %y) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc double [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret double [[MUL]] ; - %div = fdiv double %x, %y + %div = fdiv reassoc double %x, %y %mul = fmul reassoc double %div, 42.0 ret double %mul } @@ -1033,7 +1071,7 @@ define <2 x float> @fmul_fdiv_factor_constant2(<2 x float> %x, <2 x float> %y) { ; CHECK-NEXT: [[MUL:%.*]] = fdiv reassoc <2 x float> [[TMP1]], ; CHECK-NEXT: ret <2 x float> [[MUL]] ; - %div = fdiv <2 x float> %x, + %div = fdiv reassoc <2 x float> %x, %mul = fmul reassoc <2 x float> %div, %y ret <2 x float> %mul } @@ -1232,7 +1270,7 @@ define @mul_scalable_splat_zero( %z) { ; CHECK-LABEL: @mul_scalable_splat_zero( ; CHECK-NEXT: ret zeroinitializer ; - %shuf = shufflevector insertelement ( undef, float 0.0, i32 0), undef, zeroinitializer + %shuf = shufflevector insertelement ( poison, float 0.0, i32 0), poison, zeroinitializer %t3 = fmul fast %shuf, %z ret %t3 } @@ -1355,7 +1393,7 @@ define <3 x float> @mul_neg_zero_nnan_ninf_vec(<3 x float> nofpclass(inf nan) %a ; CHECK-NEXT: ret <3 x float> [[RET]] ; entry: - %ret = fmul <3 x float> %a, + %ret = fmul <3 x float> 
%a, ret <3 x float> %ret } diff --git a/llvm/test/Transforms/InstCombine/fneg-as-int.ll b/llvm/test/Transforms/InstCombine/fneg-as-int.ll index d28e599cacf36..e3067b0d02461 100644 --- a/llvm/test/Transforms/InstCombine/fneg-as-int.ll +++ b/llvm/test/Transforms/InstCombine/fneg-as-int.ll @@ -139,15 +139,15 @@ define <2 x i32> @not_fneg_as_int_v2f32_nonsplat(<2 x float> %x) { ret <2 x i32> %xor } -define <3 x i32> @fneg_as_int_v3f32_undef(<3 x float> %x) { -; CHECK-LABEL: define <3 x i32> @fneg_as_int_v3f32_undef +define <3 x i32> @fneg_as_int_v3f32_poison(<3 x float> %x) { +; CHECK-LABEL: define <3 x i32> @fneg_as_int_v3f32_poison ; CHECK-SAME: (<3 x float> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = fneg <3 x float> [[X]] ; CHECK-NEXT: [[XOR:%.*]] = bitcast <3 x float> [[TMP1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[XOR]] ; %bc = bitcast <3 x float> %x to <3 x i32> - %xor = xor <3 x i32> %bc, + %xor = xor <3 x i32> %bc, ret <3 x i32> %xor } diff --git a/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll b/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll index 9aa8d4944e39a..8c3e6958fe083 100644 --- a/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll +++ b/llvm/test/Transforms/InstCombine/fneg-fabs-as-int.ll @@ -158,8 +158,8 @@ define <2 x i32> @not_fneg_fabs_as_int_v2f32_nonsplat(<2 x float> %x) { ret <2 x i32> %or } -define <3 x i32> @fneg_fabs_as_int_v3f32_undef(<3 x float> %x) { -; CHECK-LABEL: define <3 x i32> @fneg_fabs_as_int_v3f32_undef +define <3 x i32> @fneg_fabs_as_int_v3f32_poison(<3 x float> %x) { +; CHECK-LABEL: define <3 x i32> @fneg_fabs_as_int_v3f32_poison ; CHECK-SAME: (<3 x float> [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call <3 x float> @llvm.fabs.v3f32(<3 x float> [[X]]) ; CHECK-NEXT: [[TMP2:%.*]] = fneg <3 x float> [[TMP1]] @@ -167,7 +167,7 @@ define <3 x i32> @fneg_fabs_as_int_v3f32_undef(<3 x float> %x) { ; CHECK-NEXT: ret <3 x i32> [[OR]] ; %bc = bitcast <3 x float> %x to <3 x i32> - %or = or <3 x i32> %bc, + %or = or <3 x i32> %bc, ret <3 x i32> %or } diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll index ed68ba50d36ee..7c9289c447113 100644 --- a/llvm/test/Transforms/InstCombine/fneg.ll +++ b/llvm/test/Transforms/InstCombine/fneg.ll @@ -87,24 +87,24 @@ define float @fmul_fneg_extra_use(float %x) { ret float %r } -; Try a vector. Use special constants (NaN, INF, undef) because they don't change anything. +; Try a vector. Use special constants (NaN, INF, poison) because they don't change anything. define <4 x double> @fmul_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fmul_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %m = fmul <4 x double> %x, + %m = fmul <4 x double> %x, %r = fsub <4 x double> , %m ret <4 x double> %r } define <4 x double> @fmul_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fmul_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fmul <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %m = fmul <4 x double> %x, + %m = fmul <4 x double> %x, %r = fneg <4 x double> %m ret <4 x double> %r } @@ -181,24 +181,24 @@ define float @fdiv_op1_constant_fneg_extra_use(float %x) { ret float %r } -; Try a vector. Use special constants (NaN, INF, undef) because they don't change anything. +; Try a vector. Use special constants (NaN, INF, poison) because they don't change anything. 
define <4 x double> @fdiv_op1_constant_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op1_constant_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> %x, + %d = fdiv <4 x double> %x, %r = fsub <4 x double> , %d ret <4 x double> %r } define <4 x double> @fdiv_op1_constant_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op1_constant_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> [[X:%.*]], ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> %x, + %d = fdiv <4 x double> %x, %r = fneg <4 x double> %d ret <4 x double> %r } @@ -335,24 +335,24 @@ define float @fdiv_op0_constant_fneg_extra_use(float %x) { ret float %r } -; Try a vector. Use special constants (NaN, INF, undef) because they don't change anything. +; Try a vector. Use special constants (NaN, INF, poison) because they don't change anything. define <4 x double> @fdiv_op0_constant_fsub_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op0_constant_fsub_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> , %x + %d = fdiv <4 x double> , %x %r = fsub <4 x double> , %d ret <4 x double> %r } define <4 x double> @fdiv_op0_constant_fneg_vec(<4 x double> %x) { ; CHECK-LABEL: @fdiv_op0_constant_fneg_vec( -; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fdiv <4 x double> , [[X:%.*]] ; CHECK-NEXT: ret <4 x double> [[R]] ; - %d = fdiv <4 x double> , %x + %d = fdiv <4 x double> , %x %r = fneg <4 x double> %d ret <4 x double> %r } @@ -584,11 +584,11 @@ define <2 x float> @fneg_nsz_fadd_constant_vec(<2 x float> %x) { define <2 x float> @fake_fneg_nsz_fadd_constant_vec(<2 x float> %x) { ; CHECK-LABEL: @fake_fneg_nsz_fadd_constant_vec( -; CHECK-NEXT: [[R:%.*]] = fsub nsz <2 x float> , [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = fsub nsz <2 x float> , [[X:%.*]] ; CHECK-NEXT: ret <2 x float> [[R]] ; - %a = fadd <2 x float> %x, - %r = fsub nsz <2 x float> , %a + %a = fadd <2 x float> %x, + %r = fsub nsz <2 x float> , %a ret <2 x float> %r } diff --git a/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll b/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll index b482cfdfde197..1fd570bf2635b 100644 --- a/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll +++ b/llvm/test/Transforms/InstCombine/fold-inc-of-add-of-not-x-and-y-to-sub-x-from-y.ll @@ -36,36 +36,36 @@ define <4 x i32> @t1_vec_splat(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %t2 } -define <4 x i32> @t2_vec_undef0(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @t2_vec_undef0( +define <4 x i32> @t2_vec_poison0(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @t2_vec_poison0( ; CHECK-NEXT: [[T2:%.*]] = sub <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T2]] ; - %t0 = xor <4 x i32> %x, + %t0 = xor <4 x i32> %x, %t1 = add <4 x i32> %t0, %y %t2 = add <4 x i32> %t1, ret <4 x i32> %t2 } -define <4 x i32> @t3_vec_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @t3_vec_undef1( +define <4 x i32> @t3_vec_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @t3_vec_poison1( ; CHECK-NEXT: [[T2:%.*]] = sub <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T2]] ; %t0 = xor <4 x i32> %x, %t1 = add <4 x i32> %t0, %y - %t2 = add <4 x 
i32> %t1, + %t2 = add <4 x i32> %t1, ret <4 x i32> %t2 } -define <4 x i32> @t4_vec_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @t4_vec_undef2( +define <4 x i32> @t4_vec_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @t4_vec_poison2( ; CHECK-NEXT: [[T2:%.*]] = sub <4 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T2]] ; - %t0 = xor <4 x i32> %x, + %t0 = xor <4 x i32> %x, %t1 = add <4 x i32> %t0, %y - %t2 = add <4 x i32> %t1, + %t2 = add <4 x i32> %t1, ret <4 x i32> %t2 } diff --git a/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll b/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll index 6f311f05fb017..af580ba57513c 100644 --- a/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll +++ b/llvm/test/Transforms/InstCombine/fold-sub-of-not-to-inc-of-add.ll @@ -50,13 +50,13 @@ define <4 x i32> @p1_vector_splat(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %t1 } -define <4 x i32> @p2_vector_undef(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p2_vector_undef( +define <4 x i32> @p2_vector_poison(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p2_vector_poison( ; CHECK-NEXT: [[T0_NEG:%.*]] = add <4 x i32> [[X:%.*]], ; CHECK-NEXT: [[T1:%.*]] = add <4 x i32> [[T0_NEG]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[T1]] ; - %t0 = xor <4 x i32> %x, + %t0 = xor <4 x i32> %x, %t1 = sub <4 x i32> %y, %t0 ret <4 x i32> %t1 } diff --git a/llvm/test/Transforms/InstCombine/fpcast.ll b/llvm/test/Transforms/InstCombine/fpcast.ll index ac4b88fcddd7e..69daac773a645 100644 --- a/llvm/test/Transforms/InstCombine/fpcast.ll +++ b/llvm/test/Transforms/InstCombine/fpcast.ll @@ -51,13 +51,13 @@ define half @unary_fneg_fptrunc(float %a) { ret half %c } -define <2 x half> @fneg_fptrunc_vec_undef(<2 x float> %a) { -; CHECK-LABEL: @fneg_fptrunc_vec_undef( +define <2 x half> @fneg_fptrunc_vec_poison(<2 x float> %a) { +; CHECK-LABEL: @fneg_fptrunc_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = fptrunc <2 x float> [[A:%.*]] to <2 x half> ; CHECK-NEXT: [[C:%.*]] = fneg <2 x half> [[TMP1]] ; CHECK-NEXT: ret <2 x half> [[C]] ; - %b = fsub <2 x float> , %a + %b = fsub <2 x float> , %a %c = fptrunc <2 x float> %b to <2 x half> ret <2 x half> %c } @@ -170,7 +170,7 @@ define half @sint_to_fptrunc(i32 %x) { define half @masked_sint_to_fptrunc1(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fptrunc1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = and i32 %x, 16777215 @@ -182,7 +182,7 @@ define half @masked_sint_to_fptrunc1(i32 %x) { define half @masked_sint_to_fptrunc2(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fptrunc2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = lshr i32 %x, 8 @@ -194,7 +194,7 @@ define half @masked_sint_to_fptrunc2(i32 %x) { define half @masked_sint_to_fptrunc3(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fptrunc3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = sitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fptrunc float [[F]] to half ; CHECK-NEXT: ret half [[R]] ; @@ -218,7 +218,7 @@ define double @sint_to_fpext(i32 %x) { define double @masked_sint_to_fpext1(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fpext1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] 
= sitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = and i32 %x, 16777215 @@ -230,7 +230,7 @@ define double @masked_sint_to_fpext1(i32 %x) { define double @masked_sint_to_fpext2(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fpext2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = lshr i32 %x, 8 @@ -242,7 +242,7 @@ define double @masked_sint_to_fpext2(i32 %x) { define double @masked_sint_to_fpext3(i32 %x) { ; CHECK-LABEL: @masked_sint_to_fpext3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = sitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fpext float [[F]] to double ; CHECK-NEXT: ret double [[R]] ; @@ -266,7 +266,7 @@ define half @uint_to_fptrunc(i32 %x) { define half @masked_uint_to_fptrunc1(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fptrunc1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = and i32 %x, 16777215 @@ -278,7 +278,7 @@ define half @masked_uint_to_fptrunc1(i32 %x) { define half @masked_uint_to_fptrunc2(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fptrunc2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to half +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to half ; CHECK-NEXT: ret half [[R]] ; %m = lshr i32 %x, 8 @@ -290,7 +290,7 @@ define half @masked_uint_to_fptrunc2(i32 %x) { define half @masked_uint_to_fptrunc3(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fptrunc3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = uitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fptrunc float [[F]] to half ; CHECK-NEXT: ret half [[R]] ; @@ -314,7 +314,7 @@ define double @uint_to_fpext(i32 %x) { define double @masked_uint_to_fpext1(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fpext1( ; CHECK-NEXT: [[M:%.*]] = and i32 [[X:%.*]], 16777215 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = and i32 %x, 16777215 @@ -326,7 +326,7 @@ define double @masked_uint_to_fpext1(i32 %x) { define double @masked_uint_to_fpext2(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fpext2( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 8 -; CHECK-NEXT: [[R:%.*]] = uitofp i32 [[M]] to double +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[M]] to double ; CHECK-NEXT: ret double [[R]] ; %m = lshr i32 %x, 8 @@ -338,7 +338,7 @@ define double @masked_uint_to_fpext2(i32 %x) { define double @masked_uint_to_fpext3(i32 %x) { ; CHECK-LABEL: @masked_uint_to_fpext3( ; CHECK-NEXT: [[M:%.*]] = lshr i32 [[X:%.*]], 7 -; CHECK-NEXT: [[F:%.*]] = uitofp i32 [[M]] to float +; CHECK-NEXT: [[F:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[R:%.*]] = fpext float [[F]] to double ; CHECK-NEXT: ret double [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/fsub.ll b/llvm/test/Transforms/InstCombine/fsub.ll index 6e13c33b126d5..f1e7086e697e8 100644 --- a/llvm/test/Transforms/InstCombine/fsub.ll +++ b/llvm/test/Transforms/InstCombine/fsub.ll @@ -153,12 +153,12 @@ define <2 x float> @constant_op1_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %r } -define <2 x float> 
@constant_op1_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @constant_op1_vec_undef( -; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[X:%.*]], +define <2 x float> @constant_op1_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @constant_op1_vec_poison( +; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[X:%.*]], ; CHECK-NEXT: ret <2 x float> [[R]] ; - %r = fsub <2 x float> %x, + %r = fsub <2 x float> %x, ret <2 x float> %r } @@ -204,12 +204,12 @@ define <2 x float> @unary_neg_op1_vec(<2 x float> %x, <2 x float> %y) { ret <2 x float> %r } -define <2 x float> @neg_op1_vec_undef(<2 x float> %x, <2 x float> %y) { -; CHECK-LABEL: @neg_op1_vec_undef( +define <2 x float> @neg_op1_vec_poison(<2 x float> %x, <2 x float> %y) { +; CHECK-LABEL: @neg_op1_vec_poison( ; CHECK-NEXT: [[R:%.*]] = fadd <2 x float> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x float> [[R]] ; - %negy = fsub <2 x float> , %y + %negy = fsub <2 x float> , %y %r = fsub <2 x float> %x, %negy ret <2 x float> %r } diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll index 162519e648f3e..a54e6e4642b75 100644 --- a/llvm/test/Transforms/InstCombine/funnel.ll +++ b/llvm/test/Transforms/InstCombine/funnel.ll @@ -43,24 +43,24 @@ define <2 x i16> @fshl_v2i16_constant_splat(<2 x i16> %x, <2 x i16> %y) { ret <2 x i16> %r } -define <2 x i16> @fshl_v2i16_constant_splat_undef0(<2 x i16> %x, <2 x i16> %y) { -; CHECK-LABEL: @fshl_v2i16_constant_splat_undef0( +define <2 x i16> @fshl_v2i16_constant_splat_poison0(<2 x i16> %x, <2 x i16> %y) { +; CHECK-LABEL: @fshl_v2i16_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; - %shl = shl <2 x i16> %x, + %shl = shl <2 x i16> %x, %shr = lshr <2 x i16> %y, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } -define <2 x i16> @fshl_v2i16_constant_splat_undef1(<2 x i16> %x, <2 x i16> %y) { -; CHECK-LABEL: @fshl_v2i16_constant_splat_undef1( +define <2 x i16> @fshl_v2i16_constant_splat_poison1(<2 x i16> %x, <2 x i16> %y) { +; CHECK-LABEL: @fshl_v2i16_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, - %shr = lshr <2 x i16> %y, + %shr = lshr <2 x i16> %y, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } @@ -78,30 +78,30 @@ define <2 x i17> @fshr_v2i17_constant_splat(<2 x i17> %x, <2 x i17> %y) { ret <2 x i17> %r } -define <2 x i17> @fshr_v2i17_constant_splat_undef0(<2 x i17> %x, <2 x i17> %y) { -; CHECK-LABEL: @fshr_v2i17_constant_splat_undef0( +define <2 x i17> @fshr_v2i17_constant_splat_poison0(<2 x i17> %x, <2 x i17> %y) { +; CHECK-LABEL: @fshr_v2i17_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shr = lshr <2 x i17> %x, - %shl = shl <2 x i17> %y, + %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %y, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } -define <2 x i17> @fshr_v2i17_constant_splat_undef1(<2 x i17> %x, <2 x i17> %y) { -; CHECK-LABEL: @fshr_v2i17_constant_splat_undef1( +define <2 x i17> @fshr_v2i17_constant_splat_poison1(<2 x i17> %x, <2 x i17> %y) { +; CHECK-LABEL: @fshr_v2i17_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shr = 
lshr <2 x i17> %x, - %shl = shl <2 x i17> %y, + %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %y, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } ; Allow arbitrary shift constants. -; Support undef elements. +; Support poison elements. define <2 x i32> @fshr_v2i32_constant_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @fshr_v2i32_constant_nonsplat( @@ -114,24 +114,24 @@ define <2 x i32> @fshr_v2i32_constant_nonsplat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %r } -define <2 x i32> @fshr_v2i32_constant_nonsplat_undef0(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_undef0( +define <2 x i32> @fshr_v2i32_constant_nonsplat_poison0(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %shr = lshr <2 x i32> %x, + %shr = lshr <2 x i32> %x, %shl = shl <2 x i32> %y, %r = or <2 x i32> %shl, %shr ret <2 x i32> %r } -define <2 x i32> @fshr_v2i32_constant_nonsplat_undef1(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_undef1( -; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) +define <2 x i32> @fshr_v2i32_constant_nonsplat_poison1(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @fshr_v2i32_constant_nonsplat_poison1( +; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; %shr = lshr <2 x i32> %x, - %shl = shl <2 x i32> %y, + %shl = shl <2 x i32> %y, %r = or <2 x i32> %shl, %shr ret <2 x i32> %r } @@ -147,13 +147,13 @@ define <2 x i36> @fshl_v2i36_constant_nonsplat(<2 x i36> %x, <2 x i36> %y) { ret <2 x i36> %r } -define <3 x i36> @fshl_v3i36_constant_nonsplat_undef0(<3 x i36> %x, <3 x i36> %y) { -; CHECK-LABEL: @fshl_v3i36_constant_nonsplat_undef0( -; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[Y:%.*]], <3 x i36> ) +define <3 x i36> @fshl_v3i36_constant_nonsplat_poison0(<3 x i36> %x, <3 x i36> %y) { +; CHECK-LABEL: @fshl_v3i36_constant_nonsplat_poison0( +; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[Y:%.*]], <3 x i36> ) ; CHECK-NEXT: ret <3 x i36> [[R]] ; - %shl = shl <3 x i36> %x, - %shr = lshr <3 x i36> %y, + %shl = shl <3 x i36> %x, + %shr = lshr <3 x i36> %y, %r = or <3 x i36> %shl, %shr ret <3 x i36> %r } diff --git a/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll b/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll index 12a81f0cd2f0f..40caa57891369 100644 --- a/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll +++ b/llvm/test/Transforms/InstCombine/get-lowbitmask-upto-and-including-bit.ll @@ -41,36 +41,36 @@ define <2 x i8> @t2_vec(<2 x i8> %x) { %mask = or <2 x i8> %lowbitmask, %bitmask ret <2 x i8> %mask } -define <3 x i8> @t3_vec_undef0(<3 x i8> %x) { -; CHECK-LABEL: @t3_vec_undef0( +define <3 x i8> @t3_vec_poison0(<3 x i8> %x) { +; CHECK-LABEL: @t3_vec_poison0( ; CHECK-NEXT: [[TMP1:%.*]] = sub <3 x i8> , [[X:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = lshr <3 x i8> , [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[MASK]] ; - %bitmask = shl <3 x i8> , %x + %bitmask = shl <3 x i8> , %x %lowbitmask = add <3 x i8> %bitmask, %mask = or <3 x i8> %lowbitmask, %bitmask ret <3 x i8> %mask } -define <3 x i8> @t4_vec_undef1(<3 x i8> %x) { -; CHECK-LABEL: 
@t4_vec_undef1( +define <3 x i8> @t4_vec_poison1(<3 x i8> %x) { +; CHECK-LABEL: @t4_vec_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = sub <3 x i8> , [[X:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = lshr <3 x i8> , [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[MASK]] ; %bitmask = shl <3 x i8> , %x - %lowbitmask = add <3 x i8> %bitmask, + %lowbitmask = add <3 x i8> %bitmask, %mask = or <3 x i8> %lowbitmask, %bitmask ret <3 x i8> %mask } -define <3 x i8> @t5_vec_undef2(<3 x i8> %x) { -; CHECK-LABEL: @t5_vec_undef2( +define <3 x i8> @t5_vec_poison2(<3 x i8> %x) { +; CHECK-LABEL: @t5_vec_poison2( ; CHECK-NEXT: [[TMP1:%.*]] = sub <3 x i8> , [[X:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = lshr <3 x i8> , [[TMP1]] ; CHECK-NEXT: ret <3 x i8> [[MASK]] ; - %bitmask = shl <3 x i8> , %x - %lowbitmask = add <3 x i8> %bitmask, + %bitmask = shl <3 x i8> , %x + %lowbitmask = add <3 x i8> %bitmask, %mask = or <3 x i8> %lowbitmask, %bitmask ret <3 x i8> %mask } diff --git a/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll b/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll index c8f14595ea673..e4cae13519783 100644 --- a/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll +++ b/llvm/test/Transforms/InstCombine/hoist-negation-out-of-bias-calculation.ll @@ -55,14 +55,14 @@ define <2 x i8> @t2_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %negbias } -define <2 x i8> @t3_vec_undef(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @t3_vec_undef( +define <2 x i8> @t3_vec_poison(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @t3_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i8> [[Y:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: [[NEGBIAS:%.*]] = sub <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[NEGBIAS]] ; - %negy = sub <2 x i8> , %y + %negy = sub <2 x i8> , %y %unbiasedx = and <2 x i8> %negy, %x %negbias = sub <2 x i8> %unbiasedx, %x ret <2 x i8> %negbias diff --git a/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll b/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll index e0242855e2683..2217666f0f49a 100644 --- a/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll +++ b/llvm/test/Transforms/InstCombine/hoist-not-from-ashr-operand.ll @@ -41,14 +41,14 @@ define <2 x i8> @t2_vec(<2 x i8> %x, <2 x i8> %y) { %ashr = ashr <2 x i8> %not_x, %y ret <2 x i8> %ashr } -; Note that we must sanitize undef elts of -1 constant to -1 or 0. -define <2 x i8> @t3_vec_undef(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @t3_vec_undef( +; Note that we must sanitize poison elts of -1 constant to -1 or 0. 
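; A minimal scalar sketch of the hoist itself (assuming the standard
; not-of-ashr fold): ashr only replicates the sign bit, so flipping every
; bit commutes with the shift.
;   %not_x = xor i8 %x, -1        ; ~x
;   %ashr  = ashr i8 %not_x, %y
; ==>
;   %shr  = ashr i8 %x, %y
;   %ashr = xor i8 %shr, -1       ; ~(x >>s y) == (~x) >>s y
; For vectors, any poison lane in the all-ones mask is rewritten to -1 (or 0)
; in the result constant rather than propagated, as the test below checks.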
+define <2 x i8> @t3_vec_poison(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @t3_vec_poison( ; CHECK-NEXT: [[NOT_X_NOT:%.*]] = ashr <2 x i8> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[ASHR:%.*]] = xor <2 x i8> [[NOT_X_NOT]], ; CHECK-NEXT: ret <2 x i8> [[ASHR]] ; - %not_x = xor <2 x i8> %x, + %not_x = xor <2 x i8> %x, %ashr = ashr <2 x i8> %not_x, %y ret <2 x i8> %ashr } diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 5adf476f7a79f..32ef6267cdf8b 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -56,8 +56,8 @@ define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { ; CHECK-LABEL: @p2_vec_undef0( ; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[BITS:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) -; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS]] -; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = add <3 x i8> [[T0]], +; CHECK-NEXT: [[R:%.*]] = icmp uge <3 x i8> [[T1]], [[VAL:%.*]] ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 7f4603881f23c..27b02c8c6e936 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -40,38 +40,38 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef0( +define <3 x i1> @p2_vec_poison0(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison0( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %t1 = xor <3 x i8> %t0, %r = icmp uge <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef1(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef1( +define <3 x i1> @p2_vec_poison1(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison1( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t1 = xor <3 x i8> %t0, %r = icmp uge <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef2(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef2( +define <3 x i1> @p2_vec_poison2(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison2( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - 
%t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t0 = shl <3 x i8> , %bits + %t1 = xor <3 x i8> %t0, %r = icmp uge <3 x i8> %t1, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index 550e8bb17229f..72cfb5a9f8bd0 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ugt-of-shl-1-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -38,13 +38,13 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef( +define <3 x i1> @p2_vec_poison(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp eq <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %r = icmp ugt <3 x i8> %t0, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index 26b667d36728a..79e6914f09531 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ule-of-shl-1-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -38,13 +38,13 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef( +define <3 x i1> @p2_vec_poison(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %r = icmp ule <3 x i8> %t0, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index dd353d44218bf..25894a22f0075 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-add-of-shl-one-by-bits-to-allones-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -56,8 +56,8 @@ define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { ; CHECK-LABEL: @p2_vec_undef0( ; CHECK-NEXT: [[T0:%.*]] = shl <3 x i8> , [[BITS:%.*]] ; CHECK-NEXT: call void @use3i8(<3 x i8> [[T0]]) -; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS]] -; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer +; CHECK-NEXT: [[T1:%.*]] = add <3 x i8> [[T0]], +; CHECK-NEXT: [[R:%.*]] = icmp ult <3 x i8> [[T1]], [[VAL:%.*]] ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits diff --git 
a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index c7a45c5cdc11a..8441033d4857e 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -40,38 +40,38 @@ define <2 x i1> @p1_vec(<2 x i8> %val, <2 x i8> %bits) { ret <2 x i1> %r } -define <3 x i1> @p2_vec_undef0(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef0( +define <3 x i1> @p2_vec_poison0(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison0( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits + %t0 = shl <3 x i8> , %bits %t1 = xor <3 x i8> %t0, %r = icmp ult <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef1(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef1( +define <3 x i1> @p2_vec_poison1(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison1( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t1 = xor <3 x i8> %t0, %r = icmp ult <3 x i8> %t1, %val ret <3 x i1> %r } -define <3 x i1> @p2_vec_undef2(<3 x i8> %val, <3 x i8> %bits) { -; CHECK-LABEL: @p2_vec_undef2( +define <3 x i1> @p2_vec_poison2(<3 x i8> %val, <3 x i8> %bits) { +; CHECK-LABEL: @p2_vec_poison2( ; CHECK-NEXT: [[VAL_HIGHBITS:%.*]] = lshr <3 x i8> [[VAL:%.*]], [[BITS:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ne <3 x i8> [[VAL_HIGHBITS]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[R]] ; - %t0 = shl <3 x i8> , %bits - %t1 = xor <3 x i8> %t0, + %t0 = shl <3 x i8> , %bits + %t1 = xor <3 x i8> %t0, %r = icmp ult <3 x i8> %t1, %val ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 10ab1fe118348..31093c7ca1036 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -1790,14 +1790,14 @@ define <2 x i1> @icmp_add20_eq_add57_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_add20_eq_add57_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_add20_eq_add57_undef( +define <2 x i1> @icmp_add20_eq_add57_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_add20_eq_add57_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[Y:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add <2 x i32> %x, - %2 = add <2 x i32> %y, + %2 = add <2 x i32> %y, %cmp = icmp eq <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -1838,14 +1838,14 @@ define <2 x i1> @icmp_sub57_ne_sub20_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_sub57_ne_sub20_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_sub57_ne_sub20_vec_undef( +define <2 x i1> @icmp_sub57_ne_sub20_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_sub57_ne_sub20_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <2 x i32> 
[[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %1 = add <2 x i32> %x, - %2 = add <2 x i32> %y, + %1 = add <2 x i32> %x, + %2 = add <2 x i32> %y, %cmp = icmp ne <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -1926,14 +1926,14 @@ define <2 x i1> @icmp_add20_sge_add57_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_add20_sge_add57_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_add20_sge_add57_undef( +define <2 x i1> @icmp_add20_sge_add57_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_add20_sge_add57_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[Y:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp sle <2 x i32> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %1 = add nsw <2 x i32> %x, - %2 = add nsw <2 x i32> %y, + %2 = add nsw <2 x i32> %y, %cmp = icmp sge <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -1975,14 +1975,14 @@ define <2 x i1> @icmp_sub57_sge_sub20_splat(<2 x i32> %x, <2 x i32> %y) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_sub57_sge_sub20_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @icmp_sub57_sge_sub20_vec_undef( +define <2 x i1> @icmp_sub57_sge_sub20_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @icmp_sub57_sge_sub20_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[CMP:%.*]] = icmp sge <2 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; - %1 = add nsw <2 x i32> %x, - %2 = add nsw <2 x i32> %y, + %1 = add nsw <2 x i32> %x, + %2 = add nsw <2 x i32> %y, %cmp = icmp sge <2 x i32> %1, %2 ret <2 x i1> %cmp } @@ -2557,13 +2557,13 @@ define <2 x i1> @or_icmp_eq_B_0_icmp_ult_A_B_uniform(<2 x i64> %a, <2 x i64> %b) ret <2 x i1> %3 } -define <2 x i1> @or_icmp_eq_B_0_icmp_ult_A_B_undef(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: @or_icmp_eq_B_0_icmp_ult_A_B_undef( +define <2 x i1> @or_icmp_eq_B_0_icmp_ult_A_B_poison(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @or_icmp_eq_B_0_icmp_ult_A_B_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[B:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp uge <2 x i64> [[TMP1]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; - %1 = icmp eq <2 x i64> %b, + %1 = icmp eq <2 x i64> %b, %2 = icmp ult <2 x i64> %a, %b %3 = or <2 x i1> %1, %2 ret <2 x i1> %3 @@ -2606,14 +2606,14 @@ define <2 x i1> @or_icmp_ne_A_0_icmp_ne_B_0_uniform(<2 x i64> %a, <2 x i64> %b) ret <2 x i1> %3 } -define <2 x i1> @or_icmp_ne_A_0_icmp_ne_B_0_undef(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: @or_icmp_ne_A_0_icmp_ne_B_0_undef( +define <2 x i1> @or_icmp_ne_A_0_icmp_ne_B_0_poison(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @or_icmp_ne_A_0_icmp_ne_B_0_poison( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <2 x i64> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; - %1 = icmp ne <2 x i64> %a, - %2 = icmp ne <2 x i64> %b, + %1 = icmp ne <2 x i64> %a, + %2 = icmp ne <2 x i64> %b, %3 = or <2 x i1> %1, %2 ret <2 x i1> %3 } @@ -2803,13 +2803,13 @@ define <2 x i1> @and_icmp_ne_B_0_icmp_uge_A_B_uniform(<2 x i64> %a, <2 x i64> %b ret <2 x i1> %3 } -define <2 x i1> @and_icmp_ne_B_0_icmp_uge_A_B_undef(<2 x i64> %a, <2 x i64> %b) { -; CHECK-LABEL: @and_icmp_ne_B_0_icmp_uge_A_B_undef( +define <2 x i1> @and_icmp_ne_B_0_icmp_uge_A_B_poison(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: @and_icmp_ne_B_0_icmp_uge_A_B_poison( ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[B:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult <2 x i64> [[TMP1]], [[A:%.*]] ; CHECK-NEXT: ret <2 x i1> [[TMP2]] ; - %1 = icmp ne <2 x 
i64> %b, + %1 = icmp ne <2 x i64> %b, %2 = icmp uge <2 x i64> %a, %b %3 = and <2 x i1> %1, %2 ret <2 x i1> %3 @@ -3272,13 +3272,13 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec_nonuniform(<2 x i32> %x) { ret <2 x i1> %ret } -define <2 x i1> @icmp_and_or_lshr_cst_vec_undef(<2 x i32> %x) { -; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_undef( +define <2 x i1> @icmp_and_or_lshr_cst_vec_poison(<2 x i32> %x) { +; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; - %shf = lshr <2 x i32> %x, + %shf = lshr <2 x i32> %x, %or = or <2 x i32> %shf, %x %and = and <2 x i32> %or, %ret = icmp ne <2 x i32> %and, zeroinitializer @@ -3315,15 +3315,15 @@ define <2 x i1> @icmp_and_or_lshr_cst_vec_nonuniform_commute(<2 x i32> %xp) { ret <2 x i1> %ret } -define <2 x i1> @icmp_and_or_lshr_cst_vec_undef_commute(<2 x i32> %xp) { -; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_undef_commute( +define <2 x i1> @icmp_and_or_lshr_cst_vec_poison_commute(<2 x i32> %xp) { +; CHECK-LABEL: @icmp_and_or_lshr_cst_vec_poison_commute( ; CHECK-NEXT: [[X:%.*]] = srem <2 x i32> [[XP:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[X]], ; CHECK-NEXT: [[RET:%.*]] = icmp ne <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[RET]] ; %x = srem <2 x i32> %xp, ; prevent complexity-based canonicalization - %shf = lshr <2 x i32> %x, + %shf = lshr <2 x i32> %x, %or = or <2 x i32> %x, %shf %and = and <2 x i32> %or, %ret = icmp ne <2 x i32> %and, zeroinitializer @@ -4360,7 +4360,7 @@ define <2 x i1> @signbit_false_logic(<2 x i5> %x) { ; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i5> [[X:%.*]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %dec = add <2 x i5> %x, + %dec = add <2 x i5> %x, %not = xor <2 x i5> %x, %and = and <2 x i5> %dec, %not %r = icmp sgt <2 x i5> %and, diff --git a/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll b/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll index 7cef922eaf0ce..c7e0553992b90 100644 --- a/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll +++ b/llvm/test/Transforms/InstCombine/integer-round-up-pow2-alignment.ll @@ -86,9 +86,9 @@ define <2 x i8> @t4_splat(<2 x i8> %x) { ret <2 x i8> %x.roundedup } -; Splat-with-undef -define <2 x i8> @t5_splat_undef_0b0001(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b0001( +; Splat-with-poison +define <2 x i8> @t5_splat_poison_0b0001(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b0001( ; CHECK-NEXT: [[X_BIASED1:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED1]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] @@ -96,43 +96,43 @@ define <2 x i8> @t5_splat_undef_0b0001(<2 x i8> %x) { %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, - %x.biased.highbits = and <2 x i8> %x.biased, + %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t5_splat_undef_0b0010(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b0010( +define <2 x i8> @t5_splat_poison_0b0010(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b0010( ; CHECK-NEXT: [[X_BIASED1:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED1]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp 
eq <2 x i8> %x.lowbits, - %x.biased = add <2 x i8> %x, + %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t5_splat_undef_0b0100(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b0100( +define <2 x i8> @t5_splat_poison_0b0100(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b0100( ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, - %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, + %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t5_splat_undef_0b1000(<2 x i8> %x) { -; CHECK-LABEL: @t5_splat_undef_0b1000( +define <2 x i8> @t5_splat_poison_0b1000(<2 x i8> %x) { +; CHECK-LABEL: @t5_splat_poison_0b1000( ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; - %x.lowbits = and <2 x i8> %x, + %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, @@ -177,64 +177,64 @@ define <2 x i8> @t7_nonsplat_bias(<2 x i8> %x) { } ; Splat-in-disguise vector tests -define <2 x i8> @t8_nonsplat_masked_by_undef_0b0001(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b0001( +define <2 x i8> @t8_nonsplat_masked_by_poison_0b0001(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b0001( ; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], zeroinitializer ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], -; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], +; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, - %x.biased.highbits = and <2 x i8> %x.biased, + %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t8_nonsplat_masked_by_undef_0b0010(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b0010( +define <2 x i8> @t8_nonsplat_masked_by_poison_0b0010(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b0010( ; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], zeroinitializer -; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], +; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, - %x.biased = add <2 x i8> %x, + %x.biased = add <2 x i8> %x, 
%x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t8_nonsplat_masked_by_undef_0b0100(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b0100( +define <2 x i8> @t8_nonsplat_masked_by_poison_0b0100(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b0100( ; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], +; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; %x.lowbits = and <2 x i8> %x, - %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, + %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i8> %x, <2 x i8> %x.biased.highbits ret <2 x i8> %x.roundedup } -define <2 x i8> @t8_nonsplat_masked_by_undef_0b1000(<2 x i8> %x) { -; CHECK-LABEL: @t8_nonsplat_masked_by_undef_0b1000( -; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], +define <2 x i8> @t8_nonsplat_masked_by_poison_0b1000(<2 x i8> %x) { +; CHECK-LABEL: @t8_nonsplat_masked_by_poison_0b1000( +; CHECK-NEXT: [[X_LOWBITS:%.*]] = and <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[X_LOWBITS_ARE_ZERO:%.*]] = icmp eq <2 x i8> [[X_LOWBITS]], zeroinitializer ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i8> [[X]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i8> [[X_BIASED]], ; CHECK-NEXT: [[X_ROUNDEDUP:%.*]] = select <2 x i1> [[X_LOWBITS_ARE_ZERO]], <2 x i8> [[X]], <2 x i8> [[X_BIASED_HIGHBITS]] ; CHECK-NEXT: ret <2 x i8> [[X_ROUNDEDUP]] ; - %x.lowbits = and <2 x i8> %x, + %x.lowbits = and <2 x i8> %x, %x.lowbits.are.zero = icmp eq <2 x i8> %x.lowbits, %x.biased = add <2 x i8> %x, %x.biased.highbits = and <2 x i8> %x.biased, @@ -442,28 +442,28 @@ define i8 @t17_oneuse(i8 %x) { define <2 x i4> @t18_replacement_0b0001(<2 x i4> %x) { ; CHECK-LABEL: @t18_replacement_0b0001( ; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i4> [[X:%.*]], -; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i4> [[X_BIASED]], +; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i4> [[X_BIASED]], ; CHECK-NEXT: call void @use.v2i4(<2 x i4> [[X_BIASED_HIGHBITS]]) ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; %x.lowbits = and <2 x i4> %x, %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, %x.biased = add <2 x i4> %x, - %x.biased.highbits = and <2 x i4> %x.biased, + %x.biased.highbits = and <2 x i4> %x.biased, call void @use.v2i4(<2 x i4> %x.biased.highbits) %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i4> %x, <2 x i4> %x.biased.highbits ret <2 x i4> %x.roundedup } define <2 x i4> @t18_replacement_0b0010(<2 x i4> %x) { ; CHECK-LABEL: @t18_replacement_0b0010( -; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i4> [[X:%.*]], +; CHECK-NEXT: [[X_BIASED:%.*]] = add <2 x i4> [[X:%.*]], ; CHECK-NEXT: [[X_BIASED_HIGHBITS:%.*]] = and <2 x i4> [[X_BIASED]], ; CHECK-NEXT: call void @use.v2i4(<2 x i4> [[X_BIASED_HIGHBITS]]) ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; %x.lowbits = and <2 x i4> %x, %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, - %x.biased = add <2 x i4> %x, + %x.biased = add <2 x i4> %x, 
%x.biased.highbits = and <2 x i4> %x.biased, call void @use.v2i4(<2 x i4> %x.biased.highbits) %x.roundedup = select <2 x i1> %x.lowbits.are.zero, <2 x i4> %x, <2 x i4> %x.biased.highbits @@ -477,7 +477,7 @@ define <2 x i4> @t18_replacement_0b0100(<2 x i4> %x) { ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; %x.lowbits = and <2 x i4> %x, - %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, + %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, %x.biased = add <2 x i4> %x, %x.biased.highbits = and <2 x i4> %x.biased, call void @use.v2i4(<2 x i4> %x.biased.highbits) @@ -491,7 +491,7 @@ define <2 x i4> @t18_replacement_0b1000(<2 x i4> %x) { ; CHECK-NEXT: call void @use.v2i4(<2 x i4> [[X_BIASED_HIGHBITS]]) ; CHECK-NEXT: ret <2 x i4> [[X_BIASED_HIGHBITS]] ; - %x.lowbits = and <2 x i4> %x, + %x.lowbits = and <2 x i4> %x, %x.lowbits.are.zero = icmp eq <2 x i4> %x.lowbits, %x.biased = add <2 x i4> %x, %x.biased.highbits = and <2 x i4> %x.biased, diff --git a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll index 486113202ddd7..a76662c4bc439 100644 --- a/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll +++ b/llvm/test/Transforms/InstCombine/invert-variable-mask-in-masked-merge-vector.ll @@ -20,14 +20,14 @@ define <2 x i4> @vector (<2 x i4> %x, <2 x i4> %y, <2 x i4> %m) { ret <2 x i4> %r } -define <3 x i4> @vector_undef (<3 x i4> %x, <3 x i4> %y, <3 x i4> %m) { -; CHECK-LABEL: @vector_undef( +define <3 x i4> @vector_poison (<3 x i4> %x, <3 x i4> %y, <3 x i4> %m) { +; CHECK-LABEL: @vector_poison( ; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i4> [[N0]], [[M:%.*]] ; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], [[X]] ; CHECK-NEXT: ret <3 x i4> [[R]] ; - %im = xor <3 x i4> %m, + %im = xor <3 x i4> %m, %n0 = xor <3 x i4> %x, %y %n1 = and <3 x i4> %n0, %im %r = xor <3 x i4> %n1, %y @@ -78,17 +78,17 @@ define <2 x i4> @in_constant_varx_6_invmask_nonsplat(<2 x i4> %x, <2 x i4> %mask ret <2 x i4> %r } -define <3 x i4> @in_constant_varx_6_invmask_undef(<3 x i4> %x, <3 x i4> %mask) { -; CHECK-LABEL: @in_constant_varx_6_invmask_undef( -; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[X:%.*]], +define <3 x i4> @in_constant_varx_6_invmask_poison(<3 x i4> %x, <3 x i4> %mask) { +; CHECK-LABEL: @in_constant_varx_6_invmask_poison( +; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[X:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i4> [[N0]], [[MASK:%.*]] ; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], [[X]] ; CHECK-NEXT: ret <3 x i4> [[R]] ; - %notmask = xor <3 x i4> %mask, - %n0 = xor <3 x i4> %x, ; %x + %notmask = xor <3 x i4> %mask, + %n0 = xor <3 x i4> %x, ; %x %n1 = and <3 x i4> %n0, %notmask - %r = xor <3 x i4> %n1, + %r = xor <3 x i4> %n1, ret <3 x i4> %r } @@ -133,15 +133,15 @@ define <2 x i4> @in_constant_6_vary_invmask_nonsplat(<2 x i4> %y, <2 x i4> %mask ret <2 x i4> %r } -define <3 x i4> @in_constant_6_vary_invmask_undef(<3 x i4> %y, <3 x i4> %mask) { -; CHECK-LABEL: @in_constant_6_vary_invmask_undef( -; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[Y:%.*]], +define <3 x i4> @in_constant_6_vary_invmask_poison(<3 x i4> %y, <3 x i4> %mask) { +; CHECK-LABEL: @in_constant_6_vary_invmask_poison( +; CHECK-NEXT: [[N0:%.*]] = xor <3 x i4> [[Y:%.*]], ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i4> [[N0]], [[MASK:%.*]] -; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], +; CHECK-NEXT: [[R:%.*]] = xor <3 x i4> [[TMP1]], ; CHECK-NEXT: ret <3 x i4> 
[[R]] ; - %notmask = xor <3 x i4> %mask, - %n0 = xor <3 x i4> %y, ; %x + %notmask = xor <3 x i4> %mask, + %n0 = xor <3 x i4> %y, ; %x %n1 = and <3 x i4> %n0, %notmask %r = xor <3 x i4> %n1, %y ret <3 x i4> %r diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index 85a21332b0788..8b4249b2c25a9 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -1223,7 +1223,7 @@ define i8 @known_reduce_and(<2 x i8> %xx) { ; CHECK-NEXT: ret i8 1 ; %x = or <2 x i8> %xx, - %v = call i8 @llvm.vector.reduce.or(<2 x i8> %x) + %v = call i8 @llvm.vector.reduce.and(<2 x i8> %x) %r = and i8 %v, 1 ret i8 %r } @@ -1231,12 +1231,12 @@ define i8 @known_reduce_and(<2 x i8> %xx) { define i8 @known_reduce_and_fail(<2 x i8> %xx) { ; CHECK-LABEL: @known_reduce_and_fail( ; CHECK-NEXT: [[X:%.*]] = or <2 x i8> [[XX:%.*]], -; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> [[X]]) +; CHECK-NEXT: [[V:%.*]] = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> [[X]]) ; CHECK-NEXT: [[R:%.*]] = and i8 [[V]], 2 ; CHECK-NEXT: ret i8 [[R]] ; %x = or <2 x i8> %xx, - %v = call i8 @llvm.vector.reduce.or(<2 x i8> %x) + %v = call i8 @llvm.vector.reduce.and(<2 x i8> %x) %r = and i8 %v, 2 ret i8 %r } diff --git a/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll b/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll new file mode 100644 index 0000000000000..f46ea9db751ff --- /dev/null +++ b/llvm/test/Transforms/InstCombine/known-fpclass-reduce-signbit.ll @@ -0,0 +1,119 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -S -passes=instcombine | FileCheck %s + +define i1 @vector_reduce_maximum_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_maximum_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_maximum_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_maximum_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + +define i1 @vector_reduce_minimum_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_minimum_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_minimum_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_minimum_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: 
[[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_max_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_max_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_max_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_max_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + +define i1 @vector_reduce_min_signbit(<4 x double> nofpclass(nan nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_min_signbit +; CHECK-SAME: (<4 x double> nofpclass(nan nzero) [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + +define i1 @vector_reduce_min_signbit_fail_maybe_nan(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_min_signbit_fail_maybe_nan +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: [[X_ABS:%.*]] = call <4 x double> @llvm.fabs.v4f64(<4 x double> [[X]]) +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[X_ABS]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp oge double [[OP]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[CMP]] +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + + +define i1 @vector_reduce_min_signbit_nnan_from_fmf(<4 x double> nofpclass(nzero) %x) { +; CHECK-LABEL: define i1 @vector_reduce_min_signbit_nnan_from_fmf +; CHECK-SAME: (<4 x double> nofpclass(nzero) [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %x.abs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x) + %op = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %x.abs) + %cmp = fcmp oge double %op, 0.0 + ret i1 %cmp +} + + diff --git a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll index 847a7940bad8c..5d058b20be720 100644 --- a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_lshr_and_negC_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> 
@vec_lshr_and_negC_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_lshr_and_negC_eq_undef1( +define <4 x i1> @vec_lshr_and_negC_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_lshr_and_negC_eq_poison1( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, ; ~7 + %and = and <4 x i32> %lshr, ; ~7 %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_lshr_and_negC_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_lshr_and_negC_eq_undef2( +define <4 x i1> @vec_lshr_and_negC_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_lshr_and_negC_eq_poison2( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y %and = and <4 x i32> %lshr, ; ~7 - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_lshr_and_negC_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_lshr_and_negC_eq_undef3( +define <4 x i1> @vec_lshr_and_negC_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_lshr_and_negC_eq_poison3( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, ; ~7 - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %lshr, ; ~7 + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll index 39f4e58b25dc8..0166680309ea8 100644 --- a/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/lshr-and-signbit-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_lshr_and_signbit_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_undef1( +define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_poison1( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, + %and = and <4 x i32> %lshr, %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_undef2( +define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_poison2( ; CHECK-NEXT: [[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y %and = and <4 x i32> %lshr, - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_undef3( +define <4 x i1> @vec_4xi32_lshr_and_signbit_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_lshr_and_signbit_eq_poison3( ; CHECK-NEXT: 
[[LSHR:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[LSHR]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %lshr = lshr <4 x i32> %x, %y - %and = and <4 x i32> %lshr, - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %lshr, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/masked-merge-add.ll b/llvm/test/Transforms/InstCombine/masked-merge-add.ll index f655153108a43..0484369e99d6a 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-add.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-add.ll @@ -51,7 +51,7 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], ; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = add <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %and = and <3 x i32> %x, %m @@ -61,6 +61,21 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ret <3 x i32> %ret } +define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) { +; CHECK-LABEL: @p_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] +; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %and = and <3 x i32> %x, %m + %neg = xor <3 x i32> %m, + %and1 = and <3 x i32> %neg, %y + %ret = add <3 x i32> %and, %and1 + ret <3 x i32> %ret +} + ; ============================================================================ ; ; Constant mask. ; ============================================================================ ; diff --git a/llvm/test/Transforms/InstCombine/masked-merge-or.ll b/llvm/test/Transforms/InstCombine/masked-merge-or.ll index b49ec07706e28..0531a532fc7e0 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-or.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-or.ll @@ -51,7 +51,7 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], ; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = or <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %and = and <3 x i32> %x, %m @@ -61,6 +61,21 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ret <3 x i32> %ret } +define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) { +; CHECK-LABEL: @p_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] +; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %and = and <3 x i32> %x, %m + %neg = xor <3 x i32> %m, + %and1 = and <3 x i32> %neg, %y + %ret = or <3 x i32> %and, %and1 + ret <3 x i32> %ret +} + ; ============================================================================ ; ; Constant mask. 
; ============================================================================ ; diff --git a/llvm/test/Transforms/InstCombine/masked-merge-xor.ll b/llvm/test/Transforms/InstCombine/masked-merge-xor.ll index a6d201be68cee..74cc7625aebff 100644 --- a/llvm/test/Transforms/InstCombine/masked-merge-xor.ll +++ b/llvm/test/Transforms/InstCombine/masked-merge-xor.ll @@ -51,7 +51,7 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] ; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], ; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] -; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[AND]], [[AND1]] ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %and = and <3 x i32> %x, %m @@ -61,6 +61,21 @@ define <3 x i32> @p_vec_undef(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) ret <3 x i32> %ret } +define <3 x i32> @p_vec_poison(<3 x i32> %x, <3 x i32> %y, <3 x i32> noundef %m) { +; CHECK-LABEL: @p_vec_poison( +; CHECK-NEXT: [[AND:%.*]] = and <3 x i32> [[X:%.*]], [[M:%.*]] +; CHECK-NEXT: [[NEG:%.*]] = xor <3 x i32> [[M]], +; CHECK-NEXT: [[AND1:%.*]] = and <3 x i32> [[NEG]], [[Y:%.*]] +; CHECK-NEXT: [[RET:%.*]] = or disjoint <3 x i32> [[AND]], [[AND1]] +; CHECK-NEXT: ret <3 x i32> [[RET]] +; + %and = and <3 x i32> %x, %m + %neg = xor <3 x i32> %m, + %and1 = and <3 x i32> %neg, %y + %ret = xor <3 x i32> %and, %and1 + ret <3 x i32> %ret +} + ; ============================================================================ ; ; Constant mask. ; ============================================================================ ; diff --git a/llvm/test/Transforms/InstCombine/min-positive.ll b/llvm/test/Transforms/InstCombine/min-positive.ll index 1fb212b738725..d2c2e9018792b 100644 --- a/llvm/test/Transforms/InstCombine/min-positive.ll +++ b/llvm/test/Transforms/InstCombine/min-positive.ll @@ -67,16 +67,16 @@ define <2 x i1> @smin_commute_vec(<2 x i32> %x, <2 x i32> %other) { ret <2 x i1> %test } -define <2 x i1> @smin_commute_vec_undef_elts(<2 x i32> %x, <2 x i32> %other) { -; CHECK-LABEL: @smin_commute_vec_undef_elts( -; CHECK-NEXT: [[TEST:%.*]] = icmp sgt <2 x i32> [[OTHER:%.*]], +define <2 x i1> @smin_commute_vec_poison_elts(<2 x i32> %x, <2 x i32> %other) { +; CHECK-LABEL: @smin_commute_vec_poison_elts( +; CHECK-NEXT: [[TEST:%.*]] = icmp sgt <2 x i32> [[OTHER:%.*]], ; CHECK-NEXT: ret <2 x i1> [[TEST]] ; %notneg = and <2 x i32> %x, %positive = or <2 x i32> %notneg, %cmp = icmp slt <2 x i32> %other, %positive %sel = select <2 x i1> %cmp, <2 x i32> %other, <2 x i32> %positive - %test = icmp sgt <2 x i32> %sel, + %test = icmp sgt <2 x i32> %sel, ret <2 x i1> %test } ; %positive might be zero diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 8391fe33eb9b5..8b47dc7a28079 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -131,7 +131,7 @@ define i64 @t9(i32 %a) { define float @t10(i32 %x) { ; CHECK-LABEL: @t10( ; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 255) -; CHECK-NEXT: [[R:%.*]] = sitofp i32 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i32 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %f_x = sitofp i32 %x to float @@ -143,7 +143,7 @@ define float @t10(i32 %x) { define float @t11(i64 %x) { ; CHECK-LABEL: @t11( ; CHECK-NEXT: [[R1:%.*]] = call i64 @llvm.smax.i64(i64 [[X:%.*]], i64 255) -; 
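All three masked-merge files (add, or, xor) test the same recognition: two ands with complementary masks share no set bits, so add, or, and xor of them are interchangeable and canonicalize to 'or disjoint' once the mask is known noundef. A scalar sketch of the pattern (name illustrative):

define i32 @masked_merge(i32 %x, i32 %y, i32 noundef %m) {
  %and  = and i32 %x, %m      ; bits of %x where %m is set
  %neg  = xor i32 %m, -1      ; ~%m
  %and1 = and i32 %neg, %y    ; bits of %y where %m is clear
  %ret  = add i32 %and, %and1 ; no common bits -> canonicalized to 'or disjoint'
  ret i32 %ret
}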
CHECK-NEXT: [[R:%.*]] = sitofp i64 [[R1]] to float +; CHECK-NEXT: [[R:%.*]] = uitofp nneg i64 [[R1]] to float ; CHECK-NEXT: ret float [[R]] ; %f_x = sitofp i64 %x to float @@ -526,7 +526,7 @@ falselabel: define double @PR31751_umin1(i32 %x) { ; CHECK-LABEL: @PR31751_umin1( ; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[SEL]] to double ; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp slt i32 %x, 0 @@ -538,7 +538,7 @@ define double @PR31751_umin1(i32 %x) { define double @PR31751_umin2(i32 %x) { ; CHECK-LABEL: @PR31751_umin2( ; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[SEL]] to double ; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp ult i32 %x, 2147483647 @@ -550,7 +550,7 @@ define double @PR31751_umin2(i32 %x) { define double @PR31751_umin3(i32 %x) { ; CHECK-LABEL: @PR31751_umin3( ; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[SEL]] to double +; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[SEL]] to double ; CHECK-NEXT: ret double [[CONV]] ; %cmp = icmp ugt i32 %x, 2147483647 @@ -1360,14 +1360,15 @@ define i8 @PR14613_smax(i8 %x) { define i8 @PR46271(<2 x i8> %x) { ; CHECK-LABEL: @PR46271( -; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i8> @llvm.smax.v2i8(<2 x i8> [[X:%.*]], <2 x i8> ) +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i8> [[X:%.*]], +; CHECK-NEXT: [[A_INV:%.*]] = icmp slt <2 x i8> [[X]], zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[A_INV]], <2 x i8> , <2 x i8> [[TMP3]] ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = xor i8 [[TMP2]], -1 -; CHECK-NEXT: ret i8 [[R]] +; CHECK-NEXT: ret i8 [[TMP2]] ; %a = icmp sgt <2 x i8> %x, - %b = select <2 x i1> %a, <2 x i8> %x, <2 x i8> - %not = xor <2 x i8> %b, + %b = select <2 x i1> %a, <2 x i8> %x, <2 x i8> + %not = xor <2 x i8> %b, %r = extractelement <2 x i8> %not, i32 1 ret i8 %r } diff --git a/llvm/test/Transforms/InstCombine/minmax-fp.ll b/llvm/test/Transforms/InstCombine/minmax-fp.ll index f89e8a18e6344..b9e46caa63753 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fp.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fp.ll @@ -257,7 +257,7 @@ define double @t16(i32 %x) { define double @t17(i32 %x) { ; CHECK-LABEL: @t17( ; CHECK-NEXT: [[SEL1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 2) -; CHECK-NEXT: [[SEL:%.*]] = sitofp i32 [[SEL1]] to double +; CHECK-NEXT: [[SEL:%.*]] = uitofp nneg i32 [[SEL1]] to double ; CHECK-NEXT: ret double [[SEL]] ; %cmp = icmp sgt i32 %x, 2 diff --git a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll index ae2e115b1dd9a..a76f0f84ba340 100644 --- a/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/minmax-intrinsics.ll @@ -393,7 +393,7 @@ define i8 @smax_of_nots(i8 %x, i8 %y) { ret i8 %m } -; Vectors are ok (including undef lanes of not ops) +; Vectors are ok (including poison lanes of not ops) define <3 x i8> @smin_of_nots(<3 x i8> %x, <3 x i8> %y) { ; CHECK-LABEL: @smin_of_nots( @@ -401,8 +401,8 @@ define <3 x i8> @smin_of_nots(<3 x i8> %x, <3 x i8> %y) { ; CHECK-NEXT: [[M:%.*]] = xor <3 x i8> [[TMP1]], ; CHECK-NEXT: ret <3 x i8> [[M]] ; - %notx = xor <3 x i8> %x, - %noty = xor <3 x i8> 
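The recurring CHECK change in this file (and in minmax-fp.ll and pr27236.ll below) is the same strengthening: once the clamped value is provably non-negative, sitofp is canonicalized to uitofp with the nneg flag. A minimal scalar reproduction (function name illustrative):

define float @clamp_to_fp(i32 %x) {
  %m = call i32 @llvm.smax.i32(i32 %x, i32 255) ; result is >= 255, hence non-negative
  %f = sitofp i32 %m to float                   ; becomes: uitofp nneg i32 %m to float
  ret float %f
}
declare i32 @llvm.smax.i32(i32, i32)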
%y, + %notx = xor <3 x i8> %x, + %noty = xor <3 x i8> %y, %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> %notx, <3 x i8> %noty) ret <3 x i8> %m } @@ -473,16 +473,16 @@ define i8 @smax_of_not_and_const(i8 %x) { ret i8 %m } -; Vectors are ok (including undef lanes of not ops and min/max constant operand) +; Vectors are ok (including poison lanes of not ops and min/max constant operand) define <3 x i8> @smin_of_not_and_const(<3 x i8> %x) { ; CHECK-LABEL: @smin_of_not_and_const( -; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i8> @llvm.smax.v3i8(<3 x i8> [[X:%.*]], <3 x i8> ) +; CHECK-NEXT: [[TMP1:%.*]] = call <3 x i8> @llvm.smax.v3i8(<3 x i8> [[X:%.*]], <3 x i8> ) ; CHECK-NEXT: [[M:%.*]] = xor <3 x i8> [[TMP1]], ; CHECK-NEXT: ret <3 x i8> [[M]] ; - %notx = xor <3 x i8> %x, - %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> , <3 x i8> %notx) + %notx = xor <3 x i8> %x, + %m = call <3 x i8> @llvm.smin.v3i8(<3 x i8> , <3 x i8> %notx) ret <3 x i8> %m } @@ -706,7 +706,7 @@ define <3 x i8> @smax_negation_vec(<3 x i8> %x) { ; CHECK-NEXT: [[R:%.*]] = call <3 x i8> @llvm.abs.v3i8(<3 x i8> [[X:%.*]], i1 false) ; CHECK-NEXT: ret <3 x i8> [[R]] ; - %s = sub <3 x i8> , %x + %s = sub <3 x i8> , %x %r = call <3 x i8> @llvm.smax.v3i8(<3 x i8> %x, <3 x i8> %s) ret <3 x i8> %r } @@ -912,7 +912,7 @@ define <3 x i8> @umin_non_zero_idiom4(<3 x i8> %a) { ; CHECK-NEXT: [[RES:%.*]] = zext <3 x i1> [[TMP1]] to <3 x i8> ; CHECK-NEXT: ret <3 x i8> [[RES]] ; - %res = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %a, <3 x i8> ) + %res = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %a, <3 x i8> ) ret <3 x i8> %res } @@ -2118,15 +2118,15 @@ define i8 @umin_offset_uses(i8 %x) { ret i8 %m } -; TODO: This could transform, but undef element must not propagate to the new add. +; TODO: This could transform -define <3 x i8> @umax_vector_splat_undef(<3 x i8> %x) { -; CHECK-LABEL: @umax_vector_splat_undef( -; CHECK-NEXT: [[A:%.*]] = add nuw <3 x i8> [[X:%.*]], +define <3 x i8> @umax_vector_splat_poison(<3 x i8> %x) { +; CHECK-LABEL: @umax_vector_splat_poison( +; CHECK-NEXT: [[A:%.*]] = add nuw <3 x i8> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = call <3 x i8> @llvm.umax.v3i8(<3 x i8> [[A]], <3 x i8> ) ; CHECK-NEXT: ret <3 x i8> [[R]] ; - %a = add nuw <3 x i8> %x, + %a = add nuw <3 x i8> %x, %r = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %a, <3 x i8> ) ret <3 x i8> %r } @@ -2506,8 +2506,8 @@ entry: ret i8 %val } -define <3 x i8> @fold_umax_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) { -; CHECK-LABEL: @fold_umax_with_knownbits_info_undef_in_splat( +define <3 x i8> @fold_umax_with_knownbits_info_poison_in_splat(<3 x i8> %a, <3 x i8> %b) { +; CHECK-LABEL: @fold_umax_with_knownbits_info_poison_in_splat( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A1:%.*]] = or <3 x i8> [[A:%.*]], ; CHECK-NEXT: [[A2:%.*]] = shl <3 x i8> [[B:%.*]], @@ -2518,7 +2518,7 @@ entry: %a1 = or <3 x i8> %a, %a2 = shl <3 x i8> %b, %sub = sub <3 x i8> %a1, %a2 - %val = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %sub, <3 x i8> ) + %val = call <3 x i8> @llvm.umax.v3i8(<3 x i8> %sub, <3 x i8> ) ret <3 x i8> %val } @@ -2535,8 +2535,8 @@ entry: ret i8 %val } -define <3 x i8> @fold_umin_with_knownbits_info_undef_in_splat(<3 x i8> %a, <3 x i8> %b) { -; CHECK-LABEL: @fold_umin_with_knownbits_info_undef_in_splat( +define <3 x i8> @fold_umin_with_knownbits_info_poison_in_splat(<3 x i8> %a, <3 x i8> %b) { +; CHECK-LABEL: @fold_umin_with_knownbits_info_poison_in_splat( ; CHECK-NEXT: entry: ; CHECK-NEXT: ret <3 x i8> ; @@ -2544,7 +2544,7 @@ entry: %a1 = or <3 x i8> %a, %a2 = shl <3 x i8> %b, %sub = sub <3 x i8> 
%a1, %a2 - %val = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sub, <3 x i8> ) + %val = call <3 x i8> @llvm.umin.v3i8(<3 x i8> %sub, <3 x i8> ) ret <3 x i8> %val } @@ -2581,3 +2581,92 @@ entry: %val = call i8 @llvm.umin.i8(i8 %sub, i8 3) ret i8 %val } + +define i8 @test_umax_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_umax_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.umax.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.umax.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_umin_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_umin_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.umin.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.umin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smax_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smax_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smax.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.smax.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and( +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1:%.*]], i8 [[Y1:%.*]]) +; CHECK-NEXT: [[RES1:%.*]] = and i8 [[RES]], -64 +; CHECK-NEXT: ret i8 [[RES1]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -64 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_mismatch(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_mismatch( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], -64 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], -32 +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, -64 + %y1 = and i8 %y, -32 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_non_negated_pow2(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_non_negated_pow2( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], 31 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], 31 +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, 31 + %y1 = and i8 %y, 31 + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} + +define i8 @test_smin_and_multiuse(i8 %x, i8 %y) { +; CHECK-LABEL: @test_smin_and_multiuse( +; CHECK-NEXT: [[X1:%.*]] = and i8 [[X:%.*]], 31 +; CHECK-NEXT: [[Y1:%.*]] = and i8 [[Y:%.*]], 31 +; CHECK-NEXT: call void @use(i8 [[Y1]]) +; CHECK-NEXT: [[RES:%.*]] = call i8 @llvm.smin.i8(i8 [[X1]], i8 [[Y1]]) +; CHECK-NEXT: ret i8 [[RES]] +; + %x1 = and i8 %x, 31 + %y1 = and i8 %y, 31 + call void @use(i8 %y1) + %res = call i8 @llvm.smin.i8(i8 %x1, i8 %y1) + ret i8 %res +} diff --git a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll index 8fe4261bbf009..f47c5577075cb 100644 --- a/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/mul-inseltpoison.ll @@ -784,7 +784,7 @@ define <2 x i8> @negate_if_false_commute(<2 x i8> %px, <2 x i1> %cond) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %x = sdiv <2 x i8> , %px ; thwart complexity-based canonicalization - %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> + %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> %r = mul <2 x i8> 
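The test_*_and functions added above in minmax-intrinsics.ll cover a fold where both min/max operands are masked with the same negated power of two, so the mask can be hoisted past the intrinsic. Scalar sketch of the transformed shape (name illustrative):

define i8 @umax_and_mask(i8 %x, i8 %y) {
  %x1 = and i8 %x, -64
  %y1 = and i8 %y, -64
  %res = call i8 @llvm.umax.i8(i8 %x1, i8 %y1) ; becomes: and (umax %x, %y), -64
  ret i8 %res
}
declare i8 @llvm.umax.i8(i8, i8)

The mismatch, non-negated-pow2, and multiuse variants check that the fold correctly stays off when its preconditions fail.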
%x, %sel ret <2 x i8> %r } @@ -931,7 +931,7 @@ define @mul_scalable_splat_zero( %z) { ; CHECK-LABEL: @mul_scalable_splat_zero( ; CHECK-NEXT: ret zeroinitializer ; - %shuf = shufflevector insertelement ( undef, i64 0, i32 0), poison, zeroinitializer + %shuf = shufflevector insertelement ( poison, i64 0, i32 0), poison, zeroinitializer %t3 = mul %shuf, %z ret %t3 } @@ -973,14 +973,14 @@ define <2 x i32> @mulsub1_vec_nonuniform(<2 x i32> %a0, <2 x i32> %a1) { ret <2 x i32> %mul } -define <2 x i32> @mulsub1_vec_nonuniform_undef(<2 x i32> %a0, <2 x i32> %a1) { -; CHECK-LABEL: @mulsub1_vec_nonuniform_undef( +define <2 x i32> @mulsub1_vec_nonuniform_poison(<2 x i32> %a0, <2 x i32> %a1) { +; CHECK-LABEL: @mulsub1_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> %a1, %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1017,14 +1017,14 @@ define <2 x i32> @mulsub2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @mulsub2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @mulsub2_vec_nonuniform_undef( +define <2 x i32> @mulsub2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @mulsub2_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = add <2 x i32> [[A0:%.*]], ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> , %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1061,14 +1061,14 @@ define <2 x i32> @muladd2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @muladd2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @muladd2_vec_nonuniform_undef( +define <2 x i32> @muladd2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @muladd2_vec_nonuniform_poison( ; CHECK-NEXT: [[ADD_NEG:%.*]] = sub <2 x i32> , [[A0:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[ADD_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %add = add <2 x i32> %a0, - %mul = mul <2 x i32> %add, + %mul = mul <2 x i32> %add, ret <2 x i32> %mul } diff --git a/llvm/test/Transforms/InstCombine/mul.ll b/llvm/test/Transforms/InstCombine/mul.ll index d4a689c60786e..227ca4a6d5cfa 100644 --- a/llvm/test/Transforms/InstCombine/mul.ll +++ b/llvm/test/Transforms/InstCombine/mul.ll @@ -1496,7 +1496,7 @@ define <2 x i8> @negate_if_false_commute(<2 x i8> %px, <2 x i1> %cond) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %x = sdiv <2 x i8> , %px ; thwart complexity-based canonicalization - %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> + %sel = select <2 x i1> %cond, <2 x i8> , <2 x i8> %r = mul <2 x i8> %x, %sel ret <2 x i8> %r } @@ -1643,7 +1643,7 @@ define @mul_scalable_splat_zero( %z) { ; CHECK-LABEL: @mul_scalable_splat_zero( ; CHECK-NEXT: ret zeroinitializer ; - %shuf = shufflevector insertelement ( undef, i64 0, i32 0), undef, zeroinitializer + %shuf = shufflevector insertelement ( poison, i64 0, i32 0), poison, zeroinitializer %t3 = mul %shuf, %z ret %t3 } @@ -1752,14 +1752,14 @@ define <2 x i32> @mulsub1_vec_nonuniform(<2 x i32> %a0, <2 x i32> %a1) { ret <2 x i32> %mul } -define <2 x i32> @mulsub1_vec_nonuniform_undef(<2 x i32> %a0, <2 x i32> %a1) { -; CHECK-LABEL: @mulsub1_vec_nonuniform_undef( +define <2 x i32> @mulsub1_vec_nonuniform_poison(<2 x i32> %a0, <2 x i32> %a1) { +; CHECK-LABEL: @mulsub1_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = sub <2 x i32> [[A0:%.*]], [[A1:%.*]] ; CHECK-NEXT: 
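The mulsub/muladd family above reduces a multiply by a negative power of two to a negated difference plus a shift, which is what the SUB_NEG/shl CHECK lines capture. A scalar sketch under that assumption (the -4 constant is illustrative; the vector tests use nonuniform constants that were elided here):

define i32 @mulsub(i32 %a0, i32 %a1) {
  %sub = sub i32 %a1, %a0
  %mul = mul i32 %sub, -4   ; becomes: shl (sub %a0, %a1), 2
  ret i32 %mul
}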
[[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> %a1, %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1796,14 +1796,14 @@ define <2 x i32> @mulsub2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @mulsub2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @mulsub2_vec_nonuniform_undef( +define <2 x i32> @mulsub2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @mulsub2_vec_nonuniform_poison( ; CHECK-NEXT: [[SUB_NEG:%.*]] = add <2 x i32> [[A0:%.*]], ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[SUB_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %sub = sub <2 x i32> , %a0 - %mul = mul <2 x i32> %sub, + %mul = mul <2 x i32> %sub, ret <2 x i32> %mul } @@ -1819,15 +1819,15 @@ define i8 @mulsub_nsw(i8 %a1, i8 %a2) { } ; It would be safe to keep the nsw on the shl here, but only because the mul -; to shl transform happens to replace undef with 0. -define <2 x i8> @mulsub_nsw_undef(<2 x i8> %a1, <2 x i8> %a2) { -; CHECK-LABEL: @mulsub_nsw_undef( +; to shl transform happens to replace poison with 0. +define <2 x i8> @mulsub_nsw_poison(<2 x i8> %a1, <2 x i8> %a2) { +; CHECK-LABEL: @mulsub_nsw_poison( ; CHECK-NEXT: [[A_NEG:%.*]] = sub nsw <2 x i8> [[A2:%.*]], [[A1:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i8> [[A_NEG]], ; CHECK-NEXT: ret <2 x i8> [[MUL]] ; %a = sub nsw <2 x i8> %a1, %a2 - %mul = mul nsw <2 x i8> %a, + %mul = mul nsw <2 x i8> %a, ret <2 x i8> %mul } @@ -1864,14 +1864,14 @@ define <2 x i32> @muladd2_vec_nonuniform(<2 x i32> %a0) { ret <2 x i32> %mul } -define <2 x i32> @muladd2_vec_nonuniform_undef(<2 x i32> %a0) { -; CHECK-LABEL: @muladd2_vec_nonuniform_undef( +define <2 x i32> @muladd2_vec_nonuniform_poison(<2 x i32> %a0) { +; CHECK-LABEL: @muladd2_vec_nonuniform_poison( ; CHECK-NEXT: [[ADD_NEG:%.*]] = sub <2 x i32> , [[A0:%.*]] ; CHECK-NEXT: [[MUL:%.*]] = shl <2 x i32> [[ADD_NEG]], ; CHECK-NEXT: ret <2 x i32> [[MUL]] ; %add = add <2 x i32> %a0, - %mul = mul <2 x i32> %add, + %mul = mul <2 x i32> %add, ret <2 x i32> %mul } diff --git a/llvm/test/Transforms/InstCombine/not-add.ll b/llvm/test/Transforms/InstCombine/not-add.ll index 877f558ffd503..9ba37b6bba39e 100644 --- a/llvm/test/Transforms/InstCombine/not-add.ll +++ b/llvm/test/Transforms/InstCombine/not-add.ll @@ -115,26 +115,26 @@ define <4 x i32> @vector_test(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %nota } -define <4 x i32> @vector_test_undef(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vector_test_undef( +define <4 x i32> @vector_test_poison(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vector_test_poison( ; CHECK-NEXT: [[NOTA:%.*]] = sub <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[NOTA]] ; - %notx = xor <4 x i32> %x, + %notx = xor <4 x i32> %x, %a = add <4 x i32> %notx, %y - %nota = xor <4 x i32> %a, + %nota = xor <4 x i32> %a, ret <4 x i32> %nota } -define <4 x i32> @vector_test_undef_nsw_nuw(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vector_test_undef_nsw_nuw( +define <4 x i32> @vector_test_poison_nsw_nuw(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vector_test_poison_nsw_nuw( ; CHECK-NEXT: [[NOTA:%.*]] = sub nuw nsw <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[NOTA]] ; - %notx = xor <4 x i32> %x, + %notx = xor <4 x i32> %x, %a = add nsw nuw <4 x i32> %notx, %y - %nota = xor <4 x i32> %a, + %nota = xor <4 x i32> %a, ret <4 x i32> %nota } diff --git a/llvm/test/Transforms/InstCombine/not.ll b/llvm/test/Transforms/InstCombine/not.ll index 
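not-add.ll (above) checks the complement-of-add identity in vector form; the scalar version of the fold is:

define i32 @not_add(i32 %x, i32 %y) {
  %notx = xor i32 %x, -1
  %a = add i32 %notx, %y
  %nota = xor i32 %a, -1    ; ~(~x + y) == x - y, so this becomes: sub i32 %x, %y
  ret i32 %nota
}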
98b5d98041560..0c2c6195e3240 100644 --- a/llvm/test/Transforms/InstCombine/not.ll +++ b/llvm/test/Transforms/InstCombine/not.ll @@ -430,9 +430,9 @@ define <3 x i5> @not_or_neg_commute_vec(<3 x i5> %x, <3 x i5> %p) { ; CHECK-NEXT: ret <3 x i5> [[NOT]] ; %y = mul <3 x i5> %p, ; thwart complexity-based-canonicalization - %s = sub <3 x i5> , %x + %s = sub <3 x i5> , %x %o = or <3 x i5> %y, %s - %not = xor <3 x i5> %o, + %not = xor <3 x i5> %o, ret <3 x i5> %not } diff --git a/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index c16633efe4ce3..3fd4a17d972af 100644 --- a/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/Transforms/InstCombine/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -95,41 +95,41 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %t2 } -define <4 x i1> @p6_vector_urem_by_const__nonsplat_undef0(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p6_vector_urem_by_const__nonsplat_undef0( -; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[T1:%.*]] = urem <4 x i32> [[T0]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T1]], zeroinitializer +; The poison value in the vector makes the whole function UB. + +define <4 x i1> @p6_vector_urem_by_const__nonsplat_poison0(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p6_vector_urem_by_const__nonsplat_poison0( +; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], zeroinitializer ; CHECK-NEXT: ret <4 x i1> [[T2]] ; - %t0 = and <4 x i32> %x, + %t0 = and <4 x i32> %x, %t1 = urem <4 x i32> %t0, ; '6' is clearly not a power of two %t2 = icmp eq <4 x i32> %t1, ret <4 x i1> %t2 } -define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p7_vector_urem_by_const__nonsplat_undef2( +define <4 x i1> @p7_vector_urem_by_const__nonsplat_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p7_vector_urem_by_const__nonsplat_poison2( ; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], ; CHECK-NEXT: ret <4 x i1> [[T2]] ; %t0 = and <4 x i32> %x, ; clearly a power-of-two or zero %t1 = urem <4 x i32> %t0, ; '6' is clearly not a power of two - %t2 = icmp eq <4 x i32> %t1, + %t2 = icmp eq <4 x i32> %t1, ret <4 x i1> %t2 } -define <4 x i1> @p8_vector_urem_by_const__nonsplat_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @p8_vector_urem_by_const__nonsplat_undef3( -; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[T1:%.*]] = urem <4 x i32> [[T0]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T1]], +define <4 x i1> @p8_vector_urem_by_const__nonsplat_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @p8_vector_urem_by_const__nonsplat_poison3( +; CHECK-NEXT: [[T0:%.*]] = and <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <4 x i32> [[T0]], ; CHECK-NEXT: ret <4 x i1> [[T2]] ; - %t0 = and <4 x i32> %x, + %t0 = and <4 x i32> %x, %t1 = urem <4 x i32> %t0, ; '6' is clearly not a power of two - %t2 = icmp eq <4 x i32> %t1, + %t2 = icmp eq <4 x i32> %t1, ret <4 x i1> %t2 } diff --git a/llvm/test/Transforms/InstCombine/operand-complexity.ll b/llvm/test/Transforms/InstCombine/operand-complexity.ll index 62cfc76d9d24e..541a15275b617 100644 --- 
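The omit-urem tests rely on this arithmetic fact: a value that is a power of two or zero is divisible by a non-power-of-two like 6 only when it is zero, so the urem can be dropped from the comparison. A scalar sketch (the single-bit mask 32 is illustrative):

define i1 @urem_pow2_or_zero_eq_0(i32 %x) {
  %t0 = and i32 %x, 32     ; %t0 is 32 or 0: a power of two or zero
  %t1 = urem i32 %t0, 6    ; 6 is not a power of two
  %t2 = icmp eq i32 %t1, 0 ; becomes: icmp eq i32 %t0, 0
  ret i1 %t2
}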
a/llvm/test/Transforms/InstCombine/operand-complexity.ll +++ b/llvm/test/Transforms/InstCombine/operand-complexity.ll @@ -29,15 +29,15 @@ define <2 x i8> @neg_vec(<2 x i8> %x) { ret <2 x i8> %r } -define <2 x i8> @neg_vec_undef(<2 x i8> %x) { -; CHECK-LABEL: @neg_vec_undef( +define <2 x i8> @neg_vec_poison(<2 x i8> %x) { +; CHECK-LABEL: @neg_vec_poison( ; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> , [[X]] +; CHECK-NEXT: [[NEGX:%.*]] = sub <2 x i8> , [[X]] ; CHECK-NEXT: [[R:%.*]] = xor <2 x i8> [[BO]], [[NEGX]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %bo = udiv <2 x i8> %x, - %negx = sub <2 x i8> , %x + %negx = sub <2 x i8> , %x %r = xor <2 x i8> %negx, %bo ret <2 x i8> %r } @@ -70,15 +70,15 @@ define <2 x i8> @not_vec(<2 x i8> %x) { ret <2 x i8> %r } -define <2 x i8> @not_vec_undef(<2 x i8> %x) { -; CHECK-LABEL: @not_vec_undef( +define <2 x i8> @not_vec_poison(<2 x i8> %x) { +; CHECK-LABEL: @not_vec_poison( ; CHECK-NEXT: [[BO:%.*]] = udiv <2 x i8> [[X:%.*]], -; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], +; CHECK-NEXT: [[NOTX:%.*]] = xor <2 x i8> [[X]], ; CHECK-NEXT: [[R:%.*]] = mul <2 x i8> [[BO]], [[NOTX]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %bo = udiv <2 x i8> %x, - %notx = xor <2 x i8> , %x + %notx = xor <2 x i8> , %x %r = mul <2 x i8> %notx, %bo ret <2 x i8> %r } @@ -134,8 +134,8 @@ define <2 x float> @fneg_vec(<2 x float> %x) { ret <2 x float> %r } -define <2 x float> @fneg_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fneg_vec_undef( +define <2 x float> @fneg_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fneg_vec_poison( ; CHECK-NEXT: [[BO:%.*]] = fdiv <2 x float> [[X:%.*]], ; CHECK-NEXT: [[FNEGX:%.*]] = fneg <2 x float> [[X]] ; CHECK-NEXT: [[R:%.*]] = fmul <2 x float> [[BO]], [[FNEGX]] @@ -143,7 +143,7 @@ define <2 x float> @fneg_vec_undef(<2 x float> %x) { ; CHECK-NEXT: ret <2 x float> [[R]] ; %bo = fdiv <2 x float> %x, - %fnegx = fsub <2 x float> , %x + %fnegx = fsub <2 x float> , %x %r = fmul <2 x float> %fnegx, %bo call void @use_vec(<2 x float> %fnegx) ret <2 x float> %r diff --git a/llvm/test/Transforms/InstCombine/or.ll b/llvm/test/Transforms/InstCombine/or.ll index 1b1a6ffbf0f2d..6e2085a8bb6c7 100644 --- a/llvm/test/Transforms/InstCombine/or.ll +++ b/llvm/test/Transforms/InstCombine/or.ll @@ -262,26 +262,26 @@ define <2 x i1> @and_icmp_eq_0_vector(<2 x i32> %A, <2 x i32> %B) { ret <2 x i1> %D } -define <2 x i1> @and_icmp_eq_0_vector_undef1(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @and_icmp_eq_0_vector_undef1( +define <2 x i1> @and_icmp_eq_0_vector_poison1(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @and_icmp_eq_0_vector_poison1( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[D]] ; - %C1 = icmp eq <2 x i32> %A, - %C2 = icmp eq <2 x i32> %B, + %C1 = icmp eq <2 x i32> %A, + %C2 = icmp eq <2 x i32> %B, %D = and <2 x i1> %C1, %C2 ret <2 x i1> %D } -define <2 x i1> @and_icmp_eq_0_vector_undef2(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @and_icmp_eq_0_vector_undef2( +define <2 x i1> @and_icmp_eq_0_vector_poison2(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @and_icmp_eq_0_vector_poison2( ; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[D:%.*]] = icmp eq <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: ret <2 x i1> [[D]] ; - %C1 = icmp eq <2 x i32> %A, - %C2 = icmp eq <2 x i32> %B, + %C1 = icmp eq <2 x i32> %A, + %C2 = icmp eq <2 x i32> %B, %D = and <2 x i1> %C1, %C2 ret <2 x i1> %D 
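The and_icmp_eq_0 tests being renamed below check that two equality comparisons against zero merge into one compare of the or; in scalar form:

define i1 @and_icmp_eq_0(i32 %a, i32 %b) {
  %c1 = icmp eq i32 %a, 0
  %c2 = icmp eq i32 %b, 0
  %d = and i1 %c1, %c2     ; becomes: icmp eq (or %a, %b), 0
  ret i1 %d
}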
} @@ -566,17 +566,17 @@ define <2 x i1> @test37_uniform(<2 x i32> %x) { ret <2 x i1> %ret1 } -define <2 x i1> @test37_undef(<2 x i32> %x) { -; CHECK-LABEL: @test37_undef( -; CHECK-NEXT: [[ADD1:%.*]] = add <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i32> [[ADD1]], -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> [[X]], +define <2 x i1> @test37_poison(<2 x i32> %x) { +; CHECK-LABEL: @test37_poison( +; CHECK-NEXT: [[ADD1:%.*]] = add <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i32> [[ADD1]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq <2 x i32> [[X]], ; CHECK-NEXT: [[RET1:%.*]] = or <2 x i1> [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[RET1]] ; - %add1 = add <2 x i32> %x, - %cmp1 = icmp ult <2 x i32> %add1, - %cmp2 = icmp eq <2 x i32> %x, + %add1 = add <2 x i32> %x, + %cmp1 = icmp ult <2 x i32> %add1, + %cmp2 = icmp eq <2 x i32> %x, %ret1 = or <2 x i1> %cmp1, %cmp2 ret <2 x i1> %ret1 } @@ -874,19 +874,19 @@ define <2 x i1> @test46_uniform(<2 x i8> %c) { ret <2 x i1> %or } -define <2 x i1> @test46_undef(<2 x i8> %c) { -; CHECK-LABEL: @test46_undef( -; CHECK-NEXT: [[C_OFF:%.*]] = add <2 x i8> [[C:%.*]], -; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i8> [[C_OFF]], -; CHECK-NEXT: [[C_OFF17:%.*]] = add <2 x i8> [[C]], -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult <2 x i8> [[C_OFF17]], +define <2 x i1> @test46_poison(<2 x i8> %c) { +; CHECK-LABEL: @test46_poison( +; CHECK-NEXT: [[C_OFF:%.*]] = add <2 x i8> [[C:%.*]], +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult <2 x i8> [[C_OFF]], +; CHECK-NEXT: [[C_OFF17:%.*]] = add <2 x i8> [[C]], +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult <2 x i8> [[C_OFF17]], ; CHECK-NEXT: [[OR:%.*]] = or <2 x i1> [[CMP1]], [[CMP2]] ; CHECK-NEXT: ret <2 x i1> [[OR]] ; - %c.off = add <2 x i8> %c, - %cmp1 = icmp ult <2 x i8> %c.off, - %c.off17 = add <2 x i8> %c, - %cmp2 = icmp ult <2 x i8> %c.off17, + %c.off = add <2 x i8> %c, + %cmp1 = icmp ult <2 x i8> %c.off, + %c.off17 = add <2 x i8> %c, + %cmp2 = icmp ult <2 x i8> %c.off17, %or = or <2 x i1> %cmp1, %cmp2 ret <2 x i1> %or } diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll index f0c2f129e3df3..5ed7d641df65b 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-b.ll @@ -89,13 +89,13 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t7 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], ; CHECK-NEXT: [[T1:%.*]] = zext <8 x i32> [[T0]] to <8 x i64> -; CHECK-NEXT: [[T2:%.*]] = shl <8 x i64> , [[T1]] -; CHECK-NEXT: [[T3:%.*]] = xor <8 x i64> [[T2]], -; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] +; CHECK-NEXT: [[T2:%.*]] = shl nsw <8 x i64> , [[T1]] +; CHECK-NEXT: [[T3:%.*]] = xor <8 x i64> [[T2]], +; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -106,11 +106,11 @@ define <8 x i32> 
@t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T7:%.*]] = and <8 x i32> [[TMP2]], ; CHECK-NEXT: ret <8 x i32> [[T7]] ; - %t0 = add <8 x i32> %nbits, + %t0 = add <8 x i32> %nbits, %t1 = zext <8 x i32> %t0 to <8 x i64> - %t2 = shl <8 x i64> , %t1 ; shifting by nbits-1 - %t3 = xor <8 x i64> %t2, - %t4 = sub <8 x i32> , %nbits + %t2 = shl <8 x i64> , %t1 ; shifting by nbits-1 + %t3 = xor <8 x i64> %t2, + %t4 = sub <8 x i32> , %nbits call void @use8xi32(<8 x i32> %t0) call void @use8xi64(<8 x i64> %t1) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll index 46d1de5781b71..1a711e58c333b 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-c.ll @@ -73,11 +73,11 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t5 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) @@ -87,8 +87,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) @@ -103,8 +103,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) @@ -114,8 +114,8 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll index 48873852cfc7c..cd0098ecdb0a6 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll +++ 
b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-after-truncation-variant-d.ll @@ -81,12 +81,12 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t6 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -97,9 +97,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) @@ -115,9 +115,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T2]]) @@ -128,9 +128,9 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, call void @use8xi64(<8 x i64> %t0) call void @use8xi64(<8 x i64> %t1) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll index 8b3f01bcb7691..1debf111b18cd 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-b.ll @@ -71,12 +71,12 @@ define <8 x i32> @t1_vec_splat(<8 x i32> %x, <8 x i32> %nbits) { ret <8 x i32> %t5 } -define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t1_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i32> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = xor <8 x i32> [[T1]], -; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] +define <8 x i32> @t1_vec_splat_poison(<8 x i32> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t1_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = add <8 x i32> [[NBITS:%.*]], +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i32> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = xor <8 x i32> [[T1]], +; CHECK-NEXT: [[T4:%.*]] = sub <8 x i32> , [[NBITS]] ; 
CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) @@ -85,11 +85,11 @@ define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T5:%.*]] = and <8 x i32> [[TMP1]], ; CHECK-NEXT: ret <8 x i32> [[T5]] ; - %t0 = add <8 x i32> %nbits, - %t1 = shl <8 x i32> , %t0 - %t2 = xor <8 x i32> %t1, + %t0 = add <8 x i32> %nbits, + %t1 = shl <8 x i32> , %t0 + %t2 = xor <8 x i32> %t1, %t3 = and <8 x i32> %t2, %x - %t4 = sub <8 x i32> , %nbits + %t4 = sub <8 x i32> , %nbits call void @use8xi32(<8 x i32> %t0) call void @use8xi32(<8 x i32> %t1) call void @use8xi32(<8 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll index 58a905063fac4..55d0b3f80a519 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-c.ll @@ -55,19 +55,19 @@ define <8 x i32> @t1_vec_splat(<8 x i32> %x, <8 x i32> %nbits) { ret <8 x i32> %t3 } -define <8 x i32> @t1_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t1_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = lshr <8 x i32> , [[NBITS:%.*]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +define <8 x i32> @t1_vec_splat_poison(<8 x i32> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t1_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = lshr <8 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T2]]) ; CHECK-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[X:%.*]], [[T2]] ; CHECK-NEXT: [[T3:%.*]] = and <8 x i32> [[TMP1]], ; CHECK-NEXT: ret <8 x i32> [[T3]] ; - %t0 = lshr <8 x i32> , %nbits + %t0 = lshr <8 x i32> , %nbits %t1 = and <8 x i32> %t0, %x - %t2 = add <8 x i32> %nbits, + %t2 = add <8 x i32> %nbits, call void @use8xi32(<8 x i32> %t0) call void @use8xi32(<8 x i32> %t2) %t3 = shl <8 x i32> %t1, %t2 ; shift is smaller than mask diff --git a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll index 9c096d1418a5b..7ad99a6bb0a38 100644 --- a/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/partally-redundant-left-shift-input-masking-variant-d.ll @@ -63,11 +63,11 @@ define <8 x i32> @t2_vec_splat(<8 x i32> %x, <8 x i32> %nbits) { ret <8 x i32> %t4 } -define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( -; CHECK-NEXT: [[T0:%.*]] = shl <8 x i32> , [[NBITS:%.*]] +define <8 x i32> @t2_vec_splat_poison(<8 x i32> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( +; CHECK-NEXT: [[T0:%.*]] = shl nsw <8 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i32> , [[NBITS]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T0]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T1]]) ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[T3]]) @@ -75,10 +75,10 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i32> %x, <8 x i32> %nbits) { ; CHECK-NEXT: [[T4:%.*]] = and <8 x i32> [[TMP1]], ; CHECK-NEXT: 
ret <8 x i32> [[T4]] ; - %t0 = shl <8 x i32> , %nbits + %t0 = shl <8 x i32> , %nbits %t1 = lshr <8 x i32> %t0, %nbits %t2 = and <8 x i32> %t1, %x - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, call void @use8xi32(<8 x i32> %t0) call void @use8xi32(<8 x i32> %t1) call void @use8xi32(<8 x i32> %t3) diff --git a/llvm/test/Transforms/InstCombine/pr27236.ll b/llvm/test/Transforms/InstCombine/pr27236.ll index 61ea344b1bdbd..67c320d352466 100644 --- a/llvm/test/Transforms/InstCombine/pr27236.ll +++ b/llvm/test/Transforms/InstCombine/pr27236.ll @@ -4,7 +4,7 @@ define float @test1(i32 %scale) { ; CHECK-LABEL: @test1( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[SCALE:%.*]], i32 1) -; CHECK-NEXT: [[TMP2:%.*]] = sitofp i32 [[TMP1]] to float +; CHECK-NEXT: [[TMP2:%.*]] = uitofp nneg i32 [[TMP1]] to float ; CHECK-NEXT: ret float [[TMP2]] ; %1 = icmp sgt i32 1, %scale diff --git a/llvm/test/Transforms/InstCombine/pr53357.ll b/llvm/test/Transforms/InstCombine/pr53357.ll index 0a6d2993ce46a..0ae690869c1c4 100644 --- a/llvm/test/Transforms/InstCombine/pr53357.ll +++ b/llvm/test/Transforms/InstCombine/pr53357.ll @@ -30,16 +30,16 @@ define <2 x i32> @src_vec(<2 x i32> noundef %0, <2 x i32> noundef %1) { ret <2 x i32> %6 } -; vector version of src with undef values -define <2 x i32> @src_vec_undef(<2 x i32> noundef %0, <2 x i32> noundef %1) { -; CHECK-LABEL: @src_vec_undef( +; vector version of src with poison values +define <2 x i32> @src_vec_poison(<2 x i32> noundef %0, <2 x i32> noundef %1) { +; CHECK-LABEL: @src_vec_poison( ; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[TMP1:%.*]], [[TMP0:%.*]] ; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[TMP3]], ; CHECK-NEXT: ret <2 x i32> [[TMP4]] ; %3 = and <2 x i32> %1, %0 %4 = or <2 x i32> %1, %0 - %5 = xor <2 x i32> %4, + %5 = xor <2 x i32> %4, %6 = add <2 x i32> %3, %5 ret <2 x i32> %6 } diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll index d49cfe990d82d..cb6775e689b8c 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-b.ll @@ -89,12 +89,12 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t6 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = xor <8 x i64> [[T1]], -; CHECK-NEXT: [[T3:%.*]] = sub <8 x i32> , [[NBITS]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = xor <8 x i64> [[T1]], +; CHECK-NEXT: [[T3:%.*]] = sub <8 x i32> , [[NBITS]] ; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi32(<8 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) @@ -107,9 +107,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 - %t2 = xor <8 x i64> %t1, - %t3 = sub <8 x i32> , %nbits + %t1 = shl <8 x i64> , %t0 + %t2 = xor <8 x i64> %t1, + %t3 = sub <8 x i32> , %nbits %t4 = and 
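pr53357.ll (above) exercises the identity (a & b) + ~(a | b) == ~(a ^ b), which is why the output collapses to two xors. Scalar sketch (name illustrative):

define i32 @pr53357_scalar(i32 noundef %a, i32 noundef %b) {
  %and = and i32 %a, %b
  %or = or i32 %a, %b
  %not = xor i32 %or, -1
  %add = add i32 %and, %not ; becomes: xor (xor %a, %b), -1
  ret i32 %add
}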
<8 x i64> %t2, %x call void @use8xi32(<8 x i32> %nbits) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll index fbbeffbba630b..a78246781c7f9 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-c.ll @@ -77,11 +77,11 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t5 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T3:%.*]] = and <8 x i64> [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -92,8 +92,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, %t3 = and <8 x i64> %t1, %x call void @use8xi64(<8 x i64> %t0) @@ -109,8 +109,8 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T1:%.*]] = lshr <8 x i64> , [[T0]] +; CHECK-NEXT: [[T2:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T3:%.*]] = and <8 x i64> [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -121,8 +121,8 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T5]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = lshr <8 x i64> , %t0 - %t2 = add <8 x i32> %nbits, + %t1 = lshr <8 x i64> , %t0 + %t2 = add <8 x i32> %nbits, %t3 = and <8 x i64> %t1, %x call void @use8xi64(<8 x i64> %t0) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll index 1a977f67a6a5a..b79ab79097527 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-after-truncation-variant-d.ll @@ -85,12 +85,12 @@ define <8 x i32> @t1_vec_splat(<8 x i64> %x, <8 x i32> %nbits) { ret <8 x i32> %t6 } -define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { -; CHECK-LABEL: @t2_vec_splat_undef( +define <8 x i32> @t2_vec_splat_poison(<8 x i64> %x, <8 x i32> %nbits) { +; CHECK-LABEL: @t2_vec_splat_poison( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw 
<8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -102,9 +102,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, %t4 = and <8 x i64> %t2, %x call void @use8xi64(<8 x i64> %t0) @@ -121,9 +121,9 @@ define <8 x i32> @t2_vec_splat_undef(<8 x i64> %x, <8 x i32> %nbits) { define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-LABEL: @t3_vec_nonsplat( ; CHECK-NEXT: [[T0:%.*]] = zext <8 x i32> [[NBITS:%.*]] to <8 x i64> -; CHECK-NEXT: [[T1:%.*]] = shl <8 x i64> , [[T0]] +; CHECK-NEXT: [[T1:%.*]] = shl nsw <8 x i64> , [[T0]] ; CHECK-NEXT: [[T2:%.*]] = lshr <8 x i64> , [[T0]] -; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], +; CHECK-NEXT: [[T3:%.*]] = add <8 x i32> [[NBITS]], ; CHECK-NEXT: [[T4:%.*]] = and <8 x i64> [[T2]], [[X:%.*]] ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T0]]) ; CHECK-NEXT: call void @use8xi64(<8 x i64> [[T1]]) @@ -135,9 +135,9 @@ define <8 x i32> @t3_vec_nonsplat(<8 x i64> %x, <8 x i32> %nbits) { ; CHECK-NEXT: ret <8 x i32> [[T6]] ; %t0 = zext <8 x i32> %nbits to <8 x i64> - %t1 = shl <8 x i64> , %t0 + %t1 = shl <8 x i64> , %t0 %t2 = lshr <8 x i64> %t1, %t0 - %t3 = add <8 x i32> %nbits, + %t3 = add <8 x i32> %nbits, %t4 = and <8 x i64> %t2, %x call void @use8xi64(<8 x i64> %t0) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll index ddaef5f4b47c8..4b955a894fcfe 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-b.ll @@ -155,12 +155,12 @@ define <3 x i32> @t4_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ret <3 x i32> %t5 } -define <3 x i32> @t5_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { -; CHECK-LABEL: @t5_vec_undef( -; CHECK-NEXT: [[T1:%.*]] = shl <3 x i32> , [[NBITS:%.*]] -; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], +define <3 x i32> @t5_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { +; CHECK-LABEL: @t5_vec_poison( +; CHECK-NEXT: [[T1:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[T2:%.*]] = xor <3 x i32> [[T1]], ; CHECK-NEXT: [[T3:%.*]] = and <3 x i32> [[T2]], [[X:%.*]] -; CHECK-NEXT: [[T4:%.*]] = sub <3 x i32> , [[NBITS]] +; CHECK-NEXT: [[T4:%.*]] = sub <3 x i32> , [[NBITS]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[NBITS]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T2]]) @@ -169,11 +169,11 @@ define <3 x i32> @t5_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-NEXT: [[T5:%.*]] = shl <3 x i32> [[X]], [[T4]] ; CHECK-NEXT: ret <3 x i32> [[T5]] ; - %t0 = add <3 x i32> %nbits, - %t1 = shl <3 x i32> , %t0 - %t2 = xor <3 x i32> %t1, + %t0 = add <3 x i32> %nbits, + %t1 = shl <3 x i32> , %t0 + %t2 = xor <3 x i32> %t1, %t3 = and <3 x i32> %t2, %x - %t4 = sub <3 x i32> , %nbits + %t4 = sub <3 x i32> , %nbits call void @use3xi32(<3 x i32> %t0) call void @use3xi32(<3 x i32> %t1) call void 
@use3xi32(<3 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll index c7747cfafcff5..8428ef67d6b86 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-c.ll @@ -99,20 +99,20 @@ define <3 x i32> @t3_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ret <3 x i32> %t3 } -define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { -; CHECK-LABEL: @t4_vec_undef( -; CHECK-NEXT: [[T0:%.*]] = lshr <3 x i32> , [[NBITS:%.*]] +define <3 x i32> @t4_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { +; CHECK-LABEL: @t4_vec_poison( +; CHECK-NEXT: [[T0:%.*]] = lshr <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = and <3 x i32> [[T0]], [[X:%.*]] -; CHECK-NEXT: [[T2:%.*]] = add <3 x i32> [[NBITS]], +; CHECK-NEXT: [[T2:%.*]] = add <3 x i32> [[NBITS]], ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T1]]) ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T2]]) ; CHECK-NEXT: [[T3:%.*]] = shl <3 x i32> [[X]], [[T2]] ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t0 = lshr <3 x i32> , %nbits + %t0 = lshr <3 x i32> , %nbits %t1 = and <3 x i32> %t0, %x - %t2 = add <3 x i32> %nbits, + %t2 = add <3 x i32> %nbits, call void @use3xi32(<3 x i32> %t0) call void @use3xi32(<3 x i32> %t1) call void @use3xi32(<3 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll index 549729fe8b59c..5d8ff9e9fb71b 100644 --- a/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll +++ b/llvm/test/Transforms/InstCombine/redundant-left-shift-input-masking-variant-d.ll @@ -115,9 +115,9 @@ define <3 x i32> @t3_vec_nonsplat(<3 x i32> %x, <3 x i32> %nbits) { ret <3 x i32> %t4 } -define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { -; CHECK-LABEL: @t4_vec_undef( -; CHECK-NEXT: [[T0:%.*]] = shl <3 x i32> , [[NBITS:%.*]] +define <3 x i32> @t4_vec_poison(<3 x i32> %x, <3 x i32> %nbits) { +; CHECK-LABEL: @t4_vec_poison( +; CHECK-NEXT: [[T0:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[T1:%.*]] = lshr <3 x i32> , [[NBITS]] ; CHECK-NEXT: [[T2:%.*]] = and <3 x i32> [[T1]], [[X:%.*]] ; CHECK-NEXT: call void @use3xi32(<3 x i32> [[T0]]) @@ -127,10 +127,10 @@ define <3 x i32> @t4_vec_undef(<3 x i32> %x, <3 x i32> %nbits) { ; CHECK-NEXT: [[T4:%.*]] = shl <3 x i32> [[X]], [[NBITS]] ; CHECK-NEXT: ret <3 x i32> [[T4]] ; - %t0 = shl <3 x i32> , %nbits + %t0 = shl <3 x i32> , %nbits %t1 = lshr <3 x i32> %t0, %nbits %t2 = and <3 x i32> %t1, %x - %t3 = add <3 x i32> %nbits, + %t3 = add <3 x i32> %nbits, call void @use3xi32(<3 x i32> %t0) call void @use3xi32(<3 x i32> %t1) call void @use3xi32(<3 x i32> %t2) diff --git a/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll b/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll index fd0d942ad840b..301ead708a08f 100644 --- a/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll +++ b/llvm/test/Transforms/InstCombine/reuse-constant-from-select-in-icmp.ll @@ -102,36 +102,36 @@ define <2 x i32> @p7_vec_splat_sgt(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %r } -; Vectors with undef +; Vectors with poison -define <2 x i32> @p8_vec_nonsplat_undef0(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: 
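The redundant-left-shift-input-masking families (variants b, c, and d, above and below) all reduce to one observation: bits cleared before a left shift that would discard them anyway need not be cleared at all. The simplest scalar case, as a sketch (the tests add truncation, extra uses, and offset shift amounts on top of this):

define i32 @drop_redundant_mask(i32 %x, i32 %nbits) {
  %mask = lshr i32 -1, %nbits ; low (32 - %nbits) ones
  %t = and i32 %mask, %x      ; clears only bits the shl below discards anyway
  %r = shl i32 %t, %nbits     ; becomes: shl i32 %x, %nbits
  ret i32 %r
}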
@p8_vec_nonsplat_undef0( +define <2 x i32> @p8_vec_nonsplat_poison0(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @p8_vec_nonsplat_poison0( ; CHECK-NEXT: [[T_INV:%.*]] = icmp ugt <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %t = icmp ult <2 x i32> %x, + %t = icmp ult <2 x i32> %x, %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> ret <2 x i32> %r } -define <2 x i32> @p9_vec_nonsplat_undef1(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @p9_vec_nonsplat_undef1( +define <2 x i32> @p9_vec_nonsplat_poison1(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @p9_vec_nonsplat_poison1( ; CHECK-NEXT: [[T_INV:%.*]] = icmp ugt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; %t = icmp ult <2 x i32> %x, - %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> + %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> ret <2 x i32> %r } -define <2 x i32> @p10_vec_nonsplat_undef2(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @p10_vec_nonsplat_undef2( +define <2 x i32> @p10_vec_nonsplat_poison2(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @p10_vec_nonsplat_poison2( ; CHECK-NEXT: [[T_INV:%.*]] = icmp ugt <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[T_INV]], <2 x i32> , <2 x i32> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %t = icmp ult <2 x i32> %x, - %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> + %t = icmp ult <2 x i32> %x, + %r = select <2 x i1> %t, <2 x i32> %y, <2 x i32> ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/rotate.ll b/llvm/test/Transforms/InstCombine/rotate.ll index 6c70c791fd881..eec623e2f193a 100644 --- a/llvm/test/Transforms/InstCombine/rotate.ll +++ b/llvm/test/Transforms/InstCombine/rotate.ll @@ -65,24 +65,24 @@ define <2 x i16> @rotl_v2i16_constant_splat(<2 x i16> %x) { ret <2 x i16> %r } -define <2 x i16> @rotl_v2i16_constant_splat_undef0(<2 x i16> %x) { -; CHECK-LABEL: @rotl_v2i16_constant_splat_undef0( +define <2 x i16> @rotl_v2i16_constant_splat_poison0(<2 x i16> %x) { +; CHECK-LABEL: @rotl_v2i16_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[X]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; - %shl = shl <2 x i16> %x, + %shl = shl <2 x i16> %x, %shr = lshr <2 x i16> %x, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } -define <2 x i16> @rotl_v2i16_constant_splat_undef1(<2 x i16> %x) { -; CHECK-LABEL: @rotl_v2i16_constant_splat_undef1( +define <2 x i16> @rotl_v2i16_constant_splat_poison1(<2 x i16> %x) { +; CHECK-LABEL: @rotl_v2i16_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[X]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, - %shr = lshr <2 x i16> %x, + %shr = lshr <2 x i16> %x, %r = or <2 x i16> %shl, %shr ret <2 x i16> %r } @@ -100,30 +100,30 @@ define <2 x i17> @rotr_v2i17_constant_splat(<2 x i17> %x) { ret <2 x i17> %r } -define <2 x i17> @rotr_v2i17_constant_splat_undef0(<2 x i17> %x) { -; CHECK-LABEL: @rotr_v2i17_constant_splat_undef0( +define <2 x i17> @rotr_v2i17_constant_splat_poison0(<2 x i17> %x) { +; CHECK-LABEL: @rotr_v2i17_constant_splat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> 
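The rotate.ll tests below confirm that a constant rotate written as shifted halves is recognized as a funnel shift. Scalar sketch (the shift amount 3 is illustrative):

define i32 @rotl3(i32 %x) {
  %shl = shl i32 %x, 3
  %shr = lshr i32 %x, 29
  %r = or i32 %shl, %shr ; becomes: call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 3)
  ret i32 %r
}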
[[X:%.*]], <2 x i17> [[X]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shl = shl <2 x i17> %x, - %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %x, + %shr = lshr <2 x i17> %x, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } -define <2 x i17> @rotr_v2i17_constant_splat_undef1(<2 x i17> %x) { -; CHECK-LABEL: @rotr_v2i17_constant_splat_undef1( +define <2 x i17> @rotr_v2i17_constant_splat_poison1(<2 x i17> %x) { +; CHECK-LABEL: @rotr_v2i17_constant_splat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[X:%.*]], <2 x i17> [[X]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; - %shl = shl <2 x i17> %x, - %shr = lshr <2 x i17> %x, + %shl = shl <2 x i17> %x, + %shr = lshr <2 x i17> %x, %r = or <2 x i17> %shr, %shl ret <2 x i17> %r } ; Allow arbitrary shift constants. -; Support undef elements. +; Support poison elements. define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) { ; CHECK-LABEL: @rotr_v2i32_constant_nonsplat( @@ -136,17 +136,6 @@ define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) { ret <2 x i32> %r } -define <2 x i32> @rotr_v2i32_constant_nonsplat_undef0(<2 x i32> %x) { -; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_undef0( -; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) -; CHECK-NEXT: ret <2 x i32> [[R]] -; - %shl = shl <2 x i32> %x, - %shr = lshr <2 x i32> %x, - %r = or <2 x i32> %shl, %shr - ret <2 x i32> %r -} - define <2 x i32> @rotr_v2i32_constant_nonsplat_poison0(<2 x i32> %x) { ; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_poison0( ; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) @@ -158,13 +147,13 @@ define <2 x i32> @rotr_v2i32_constant_nonsplat_poison0(<2 x i32> %x) { ret <2 x i32> %r } -define <2 x i32> @rotr_v2i32_constant_nonsplat_undef1(<2 x i32> %x) { -; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_undef1( +define <2 x i32> @rotr_v2i32_constant_nonsplat_poison1(<2 x i32> %x) { +; CHECK-LABEL: @rotr_v2i32_constant_nonsplat_poison1( ; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; %shl = shl <2 x i32> %x, - %shr = lshr <2 x i32> %x, + %shr = lshr <2 x i32> %x, %r = or <2 x i32> %shl, %shr ret <2 x i32> %r } @@ -180,13 +169,13 @@ define <2 x i36> @rotl_v2i36_constant_nonsplat(<2 x i36> %x) { ret <2 x i36> %r } -define <3 x i36> @rotl_v3i36_constant_nonsplat_undef0(<3 x i36> %x) { -; CHECK-LABEL: @rotl_v3i36_constant_nonsplat_undef0( -; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[X]], <3 x i36> ) +define <3 x i36> @rotl_v3i36_constant_nonsplat_poison0(<3 x i36> %x) { +; CHECK-LABEL: @rotl_v3i36_constant_nonsplat_poison0( +; CHECK-NEXT: [[R:%.*]] = call <3 x i36> @llvm.fshl.v3i36(<3 x i36> [[X:%.*]], <3 x i36> [[X]], <3 x i36> ) ; CHECK-NEXT: ret <3 x i36> [[R]] ; - %shl = shl <3 x i36> %x, - %shr = lshr <3 x i36> %x, + %shl = shl <3 x i36> %x, + %shr = lshr <3 x i36> %x, %r = or <3 x i36> %shl, %shr ret <3 x i36> %r } diff --git a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll index c1bb6941d4568..57977a72cd08f 100644 --- a/llvm/test/Transforms/InstCombine/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/saturating-add-sub.ll @@ -559,14 +559,14 @@ define <2 x i8> @test_simplify_decrement_vec(<2 x i8> %a) { ret <2 x i8> %i2 } -define <2 x i8> @test_simplify_decrement_vec_undef(<2 x i8> %a) { 
-; CHECK-LABEL: @test_simplify_decrement_vec_undef( +define <2 x i8> @test_simplify_decrement_vec_poison(<2 x i8> %a) { +; CHECK-LABEL: @test_simplify_decrement_vec_poison( ; CHECK-NEXT: [[I2:%.*]] = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> [[A:%.*]], <2 x i8> ) ; CHECK-NEXT: ret <2 x i8> [[I2]] ; %i = icmp eq <2 x i8> %a, %i1 = sub <2 x i8> %a, - %i2 = select <2 x i1> %i, <2 x i8> , <2 x i8> %i1 + %i2 = select <2 x i1> %i, <2 x i8> , <2 x i8> %i1 ret <2 x i8> %i2 } @@ -1818,14 +1818,14 @@ define <4 x i32> @uadd_sat_constant_vec_commute(<4 x i32> %x) { define <4 x i32> @uadd_sat_constant_vec_commute_undefs(<4 x i32> %x) { ; CHECK-LABEL: @uadd_sat_constant_vec_commute_undefs( -; CHECK-NEXT: [[A:%.*]] = add <4 x i32> [[X:%.*]], -; CHECK-NEXT: [[C:%.*]] = icmp ult <4 x i32> [[X]], -; CHECK-NEXT: [[R:%.*]] = select <4 x i1> [[C]], <4 x i32> [[A]], <4 x i32> +; CHECK-NEXT: [[A:%.*]] = add <4 x i32> [[X:%.*]], +; CHECK-NEXT: [[C:%.*]] = icmp ult <4 x i32> [[X]], +; CHECK-NEXT: [[R:%.*]] = select <4 x i1> [[C]], <4 x i32> [[A]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; - %a = add <4 x i32> %x, - %c = icmp ult <4 x i32> %x, - %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> + %a = add <4 x i32> %x, + %c = icmp ult <4 x i32> %x, + %r = select <4 x i1> %c, <4 x i32> %a, <4 x i32> ret <4 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/select-of-bittest.ll b/llvm/test/Transforms/InstCombine/select-of-bittest.ll index a6f14cbfbfadf..e3eb76de459e2 100644 --- a/llvm/test/Transforms/InstCombine/select-of-bittest.ll +++ b/llvm/test/Transforms/InstCombine/select-of-bittest.ll @@ -80,19 +80,18 @@ define <2 x i32> @and_lshr_and_vec_v2(<2 x i32> %arg) { ret <2 x i32> %t4 } -define <3 x i32> @and_lshr_and_vec_undef(<3 x i32> %arg) { -; CHECK-LABEL: @and_lshr_and_vec_undef( +define <3 x i32> @and_lshr_and_vec_poison(<3 x i32> %arg) { +; CHECK-LABEL: @and_lshr_and_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <3 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[T4:%.*]] = zext <3 x i1> [[TMP2]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T4]] ; - %t = and <3 x i32> %arg, - %t1 = icmp eq <3 x i32> %t, - %t2 = lshr <3 x i32> %arg, - %t3 = and <3 x i32> %t2, - ; The second element of %t4 is poison because it is (undef ? poison : undef). 
- %t4 = select <3 x i1> %t1, <3 x i32> %t3, <3 x i32> + %t = and <3 x i32> %arg, + %t1 = icmp eq <3 x i32> %t, + %t2 = lshr <3 x i32> %arg, + %t3 = and <3 x i32> %t2, + %t4 = select <3 x i1> %t1, <3 x i32> %t3, <3 x i32> ret <3 x i32> %t4 } @@ -138,17 +137,17 @@ define <2 x i32> @and_and_vec(<2 x i32> %arg) { ret <2 x i32> %t3 } -define <3 x i32> @and_and_vec_undef(<3 x i32> %arg) { -; CHECK-LABEL: @and_and_vec_undef( -; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]], +define <3 x i32> @and_and_vec_poison(<3 x i32> %arg) { +; CHECK-LABEL: @and_and_vec_poison( +; CHECK-NEXT: [[TMP1:%.*]] = and <3 x i32> [[ARG:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <3 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[T3:%.*]] = zext <3 x i1> [[TMP2]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t = and <3 x i32> %arg, - %t1 = icmp eq <3 x i32> %t, - %t2 = and <3 x i32> %arg, - %t3 = select <3 x i1> %t1, <3 x i32> %t2, <3 x i32> + %t = and <3 x i32> %arg, + %t1 = icmp eq <3 x i32> %t, + %t2 = and <3 x i32> %arg, + %t3 = select <3 x i1> %t1, <3 x i32> %t2, <3 x i32> ret <3 x i32> %t3 } @@ -221,8 +220,8 @@ define <2 x i32> @f_var0_vec(<2 x i32> %arg, <2 x i32> %arg1) { ret <2 x i32> %t5 } -define <3 x i32> @f_var0_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) { -; CHECK-LABEL: @f_var0_vec_undef( +define <3 x i32> @f_var0_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) { +; CHECK-LABEL: @f_var0_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer @@ -230,11 +229,11 @@ define <3 x i32> @f_var0_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) { ; CHECK-NEXT: ret <3 x i32> [[T5]] ; %t = and <3 x i32> %arg, %arg1 - %t2 = icmp eq <3 x i32> %t, - %t3 = lshr <3 x i32> %arg, - %t4 = and <3 x i32> %t3, - ; The second element of %t5 is poison because it is (undef ? poison : undef). - %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32> + %t2 = icmp eq <3 x i32> %t, + %t3 = lshr <3 x i32> %arg, + %t4 = and <3 x i32> %t3, + ; The second element of %t5 is poison because it is (poison ? poison : poison). 
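+ ; (Per the LangRef, a vector select yields poison in any lane whose condition is poison, regardless of which arm is chosen, so the comment above holds lane-wise.)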
+ %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32> ret <3 x i32> %t5 } @@ -284,8 +283,8 @@ define <2 x i32> @f_var1_vec(<2 x i32> %arg, <2 x i32> %arg1) { ret <2 x i32> %t4 } -define <3 x i32> @f_var1_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) { -; CHECK-LABEL: @f_var1_vec_undef( +define <3 x i32> @f_var1_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) { +; CHECK-LABEL: @f_var1_vec_poison( ; CHECK-NEXT: [[TMP1:%.*]] = or <3 x i32> [[ARG1:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[ARG:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer @@ -293,9 +292,9 @@ define <3 x i32> @f_var1_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) { ; CHECK-NEXT: ret <3 x i32> [[T4]] ; %t = and <3 x i32> %arg, %arg1 - %t2 = icmp eq <3 x i32> %t, - %t3 = and <3 x i32> %arg, - %t4 = select <3 x i1> %t2, <3 x i32> %t3, <3 x i32> + %t2 = icmp eq <3 x i32> %t, + %t3 = and <3 x i32> %arg, + %t4 = select <3 x i1> %t2, <3 x i32> %t3, <3 x i32> ret <3 x i32> %t4 } @@ -354,20 +353,20 @@ define <2 x i32> @f_var2_vec(<2 x i32> %arg, <2 x i32> %arg1) { ret <2 x i32> %t5 } -define <3 x i32> @f_var2_vec_undef(<3 x i32> %arg, <3 x i32> %arg1) { -; CHECK-LABEL: @f_var2_vec_undef( -; CHECK-NEXT: [[T:%.*]] = and <3 x i32> [[ARG:%.*]], -; CHECK-NEXT: [[T2:%.*]] = icmp eq <3 x i32> [[T]], +define <3 x i32> @f_var2_vec_poison(<3 x i32> %arg, <3 x i32> %arg1) { +; CHECK-LABEL: @f_var2_vec_poison( +; CHECK-NEXT: [[T:%.*]] = and <3 x i32> [[ARG:%.*]], +; CHECK-NEXT: [[T2:%.*]] = icmp eq <3 x i32> [[T]], ; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[ARG]], [[ARG1:%.*]] -; CHECK-NEXT: [[T4:%.*]] = and <3 x i32> [[T3]], -; CHECK-NEXT: [[T5:%.*]] = select <3 x i1> [[T2]], <3 x i32> [[T4]], <3 x i32> +; CHECK-NEXT: [[T4:%.*]] = and <3 x i32> [[T3]], +; CHECK-NEXT: [[T5:%.*]] = select <3 x i1> [[T2]], <3 x i32> [[T4]], <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T5]] ; - %t = and <3 x i32> %arg, - %t2 = icmp eq <3 x i32> %t, + %t = and <3 x i32> %arg, + %t2 = icmp eq <3 x i32> %t, %t3 = lshr <3 x i32> %arg, %arg1 - %t4 = and <3 x i32> %t3, - %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32> + %t4 = and <3 x i32> %t3, + %t5 = select <3 x i1> %t2, <3 x i32> %t4, <3 x i32> ret <3 x i32> %t5 } @@ -427,20 +426,20 @@ define <2 x i32> @f_var3_splatvec(<2 x i32> %arg, <2 x i32> %arg1, <2 x i32> %ar ret <2 x i32> %t6 } -define <3 x i32> @f_var3_vec_undef(<3 x i32> %arg, <3 x i32> %arg1, <3 x i32> %arg2) { -; CHECK-LABEL: @f_var3_vec_undef( +define <3 x i32> @f_var3_vec_poison(<3 x i32> %arg, <3 x i32> %arg1, <3 x i32> %arg2) { +; CHECK-LABEL: @f_var3_vec_poison( ; CHECK-NEXT: [[T:%.*]] = and <3 x i32> [[ARG:%.*]], [[ARG1:%.*]] -; CHECK-NEXT: [[T3:%.*]] = icmp eq <3 x i32> [[T]], +; CHECK-NEXT: [[T3:%.*]] = icmp eq <3 x i32> [[T]], ; CHECK-NEXT: [[T4:%.*]] = lshr <3 x i32> [[ARG]], [[ARG2:%.*]] -; CHECK-NEXT: [[T5:%.*]] = and <3 x i32> [[T4]], -; CHECK-NEXT: [[T6:%.*]] = select <3 x i1> [[T3]], <3 x i32> [[T5]], <3 x i32> +; CHECK-NEXT: [[T5:%.*]] = and <3 x i32> [[T4]], +; CHECK-NEXT: [[T6:%.*]] = select <3 x i1> [[T3]], <3 x i32> [[T5]], <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[T6]] ; %t = and <3 x i32> %arg, %arg1 - %t3 = icmp eq <3 x i32> %t, + %t3 = icmp eq <3 x i32> %t, %t4 = lshr <3 x i32> %arg, %arg2 - %t5 = and <3 x i32> %t4, - %t6 = select <3 x i1> %t3, <3 x i32> %t5, <3 x i32> + %t5 = and <3 x i32> %t4, + %t6 = select <3 x i1> %t3, <3 x i32> %t5, <3 x i32> ret <3 x i32> %t6 } diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 
bd8145ab2a35b..8654691c6f875 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -3109,45 +3109,46 @@ define <4 x i32> @mul_select_eq_zero_vector(<4 x i32> %x, <4 x i32> %y) { } ; Check that a select is folded into multiplication if condition's operand -; is a vector consisting of zeros and undefs. -; select ( x == {0, undef, ...}), 0, x * y --> freeze(y) * x -define <2 x i32> @mul_select_eq_undef_vector(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @mul_select_eq_undef_vector( -; CHECK-NEXT: [[Y_FR:%.*]] = freeze <2 x i32> [[Y:%.*]] +; is a vector consisting of zeros and poisons. +; select ( x == {0, poison, ...}), 0, x * y --> freeze(y) * x +define <2 x i32> @mul_select_eq_poison_vector(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @mul_select_eq_poison_vector( +; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[Y_FR:%.*]], ; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[Y_FR]], [[X:%.*]] -; CHECK-NEXT: ret <2 x i32> [[M]] +; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C]], <2 x i32> , <2 x i32> [[M]] +; CHECK-NEXT: ret <2 x i32> [[R]] ; - %c = icmp eq <2 x i32> %x, + %c = icmp eq <2 x i32> %x, %m = mul <2 x i32> %x, %y %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m ret <2 x i32> %r } ; Check that a select is folded into multiplication if other select's operand -; is a vector consisting of zeros and undefs. -; select ( x == 0), {0, undef, ...}, x * y --> freeze(y) * x -define <2 x i32> @mul_select_eq_zero_sel_undef_vector(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @mul_select_eq_zero_sel_undef_vector( +; is a vector consisting of zeros and poisons. +; select ( x == 0), {0, poison, ...}, x * y --> freeze(y) * x +define <2 x i32> @mul_select_eq_zero_sel_poison_vector(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @mul_select_eq_zero_sel_poison_vector( ; CHECK-NEXT: [[Y_FR:%.*]] = freeze <2 x i32> [[Y:%.*]] ; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[Y_FR]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[M]] ; %c = icmp eq <2 x i32> %x, zeroinitializer %m = mul <2 x i32> %x, %y - %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m + %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m ret <2 x i32> %r } ; Negative test: select should not be folded into mul because ; condition's operand and select's operand do not merge into zero vector. -define <2 x i32> @mul_select_eq_undef_vector_not_merging_to_zero(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @mul_select_eq_undef_vector_not_merging_to_zero( -; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[X:%.*]], +define <2 x i32> @mul_select_eq_poison_vector_not_merging_to_zero(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @mul_select_eq_poison_vector_not_merging_to_zero( +; CHECK-NEXT: [[C:%.*]] = icmp eq <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[M:%.*]] = mul <2 x i32> [[X]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C]], <2 x i32> , <2 x i32> [[M]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %c = icmp eq <2 x i32> %x, + %c = icmp eq <2 x i32> %x, %m = mul <2 x i32> %x, %y %r = select <2 x i1> %c, <2 x i32> , <2 x i32> %m ret <2 x i32> %r diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll index aa794e82e0fdc..3898fd9fa1f57 100644 --- a/llvm/test/Transforms/InstCombine/select_meta.ll +++ b/llvm/test/Transforms/InstCombine/select_meta.ll @@ -301,15 +301,15 @@ define <2 x i32> @not_cond_vec(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) { ret <2 x i32> %r } -; Should match vector 'not' with undef element. +; Should match vector 'not' with poison element. 
; The condition is inverted, and the select ops are swapped. The metadata should be swapped. -define <2 x i32> @not_cond_vec_undef(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) { -; CHECK-LABEL: @not_cond_vec_undef( +define <2 x i32> @not_cond_vec_poison(<2 x i1> %c, <2 x i32> %tv, <2 x i32> %fv) { +; CHECK-LABEL: @not_cond_vec_poison( ; CHECK-NEXT: [[R:%.*]] = select <2 x i1> [[C:%.*]], <2 x i32> [[FV:%.*]], <2 x i32> [[TV:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %notc = xor <2 x i1> %c, + %notc = xor <2 x i1> %c, %r = select <2 x i1> %notc, <2 x i32> %tv, <2 x i32> %fv, !prof !1 ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll b/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll index 3ee0224eb1d03..a3c8d3393d04f 100644 --- a/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll +++ b/llvm/test/Transforms/InstCombine/set-lowbits-mask-canonicalize.ll @@ -196,36 +196,36 @@ define <2 x i32> @shl_add_vec(<2 x i32> %NBits) { ret <2 x i32> %ret } -define <3 x i32> @shl_add_vec_undef0(<3 x i32> %NBits) { -; CHECK-LABEL: @shl_add_vec_undef0( +define <3 x i32> @shl_add_vec_poison0(<3 x i32> %NBits) { +; CHECK-LABEL: @shl_add_vec_poison0( ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[NOTMASK]], ; CHECK-NEXT: ret <3 x i32> [[RET]] ; - %setbit = shl <3 x i32> , %NBits + %setbit = shl <3 x i32> , %NBits %ret = add <3 x i32> %setbit, ret <3 x i32> %ret } -define <3 x i32> @shl_add_vec_undef1(<3 x i32> %NBits) { -; CHECK-LABEL: @shl_add_vec_undef1( +define <3 x i32> @shl_add_vec_poison1(<3 x i32> %NBits) { +; CHECK-LABEL: @shl_add_vec_poison1( ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[NOTMASK]], ; CHECK-NEXT: ret <3 x i32> [[RET]] ; %setbit = shl <3 x i32> , %NBits - %ret = add <3 x i32> %setbit, + %ret = add <3 x i32> %setbit, ret <3 x i32> %ret } -define <3 x i32> @shl_add_vec_undef2(<3 x i32> %NBits) { -; CHECK-LABEL: @shl_add_vec_undef2( +define <3 x i32> @shl_add_vec_poison2(<3 x i32> %NBits) { +; CHECK-LABEL: @shl_add_vec_poison2( ; CHECK-NEXT: [[NOTMASK:%.*]] = shl nsw <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[RET:%.*]] = xor <3 x i32> [[NOTMASK]], ; CHECK-NEXT: ret <3 x i32> [[RET]] ; - %setbit = shl <3 x i32> , %NBits - %ret = add <3 x i32> %setbit, + %setbit = shl <3 x i32> , %NBits + %ret = add <3 x i32> %setbit, ret <3 x i32> %ret } diff --git a/llvm/test/Transforms/InstCombine/sext.ll b/llvm/test/Transforms/InstCombine/sext.ll index e3b6058ce7f80..6d263cfcda057 100644 --- a/llvm/test/Transforms/InstCombine/sext.ll +++ b/llvm/test/Transforms/InstCombine/sext.ll @@ -167,39 +167,39 @@ define <2 x i32> @test10_vec_nonuniform(<2 x i32> %i) { ret <2 x i32> %D } -define <2 x i32> @test10_vec_undef0(<2 x i32> %i) { -; CHECK-LABEL: @test10_vec_undef0( -; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]], -; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]], +define <2 x i32> @test10_vec_poison0(<2 x i32> %i) { +; CHECK-LABEL: @test10_vec_poison0( +; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]], +; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]], ; CHECK-NEXT: ret <2 x i32> [[D]] ; %A = trunc <2 x i32> %i to <2 x i8> %B = shl <2 x i8> %A, - %C = ashr <2 x i8> %B, + %C = ashr <2 x i8> %B, %D = sext <2 x i8> %C to <2 x i32> ret <2 x i32> %D } -define <2 x i32> @test10_vec_undef1(<2 x i32> %i) { -; CHECK-LABEL: @test10_vec_undef1( +define <2 x i32> @test10_vec_poison1(<2 x 
i32> %i) { +; CHECK-LABEL: @test10_vec_poison1( ; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]], ; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]], ; CHECK-NEXT: ret <2 x i32> [[D]] ; %A = trunc <2 x i32> %i to <2 x i8> - %B = shl <2 x i8> %A, + %B = shl <2 x i8> %A, %C = ashr <2 x i8> %B, %D = sext <2 x i8> %C to <2 x i32> ret <2 x i32> %D } -define <2 x i32> @test10_vec_undef2(<2 x i32> %i) { -; CHECK-LABEL: @test10_vec_undef2( -; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]], -; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]], +define <2 x i32> @test10_vec_poison2(<2 x i32> %i) { +; CHECK-LABEL: @test10_vec_poison2( +; CHECK-NEXT: [[D1:%.*]] = shl <2 x i32> [[I:%.*]], +; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i32> [[D1]], ; CHECK-NEXT: ret <2 x i32> [[D]] ; %A = trunc <2 x i32> %i to <2 x i8> - %B = shl <2 x i8> %A, - %C = ashr <2 x i8> %B, + %B = shl <2 x i8> %A, + %C = ashr <2 x i8> %B, %D = sext <2 x i8> %C to <2 x i32> ret <2 x i32> %D } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll index 0262db1a01e5c..96d429c62a88f 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-in-bittest.ll @@ -143,34 +143,34 @@ define <2 x i1> @t8_const_lshr_shl_ne_vec_nonsplat(<2 x i32> %x, <2 x i32> %y) { %t3 = icmp ne <2 x i32> %t2, ret <2 x i1> %t3 } -define <3 x i1> @t9_const_lshr_shl_ne_vec_undef0(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t9_const_lshr_shl_ne_vec_undef0( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t9_const_lshr_shl_ne_vec_poison0(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t9_const_lshr_shl_ne_vec_poison0( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, + %t0 = lshr <3 x i32> %x, %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t10_const_lshr_shl_ne_vec_undef1(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t10_const_lshr_shl_ne_vec_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t10_const_lshr_shl_ne_vec_poison1(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t10_const_lshr_shl_ne_vec_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t11_const_lshr_shl_ne_vec_undef2(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t11_const_lshr_shl_ne_vec_undef2( +define <3 x i1> @t11_const_lshr_shl_ne_vec_poison2(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t11_const_lshr_shl_ne_vec_poison2( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer @@ -179,59 +179,59 @@ define <3 x i1> @t11_const_lshr_shl_ne_vec_undef2(<3 x i32> %x, <3 x i32> %y) { %t0 = lshr <3 x i32> %x, %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> 
%t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t12_const_lshr_shl_ne_vec_undef3(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t12_const_lshr_shl_ne_vec_undef3( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t12_const_lshr_shl_ne_vec_poison3(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t12_const_lshr_shl_ne_vec_poison3( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t0 = lshr <3 x i32> %x, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t13_const_lshr_shl_ne_vec_undef4(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t13_const_lshr_shl_ne_vec_undef4( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t13_const_lshr_shl_ne_vec_poison4(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t13_const_lshr_shl_ne_vec_poison4( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t14_const_lshr_shl_ne_vec_undef5(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t14_const_lshr_shl_ne_vec_undef5( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t14_const_lshr_shl_ne_vec_poison5(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t14_const_lshr_shl_ne_vec_poison5( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, + %t0 = lshr <3 x i32> %x, %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } -define <3 x i1> @t15_const_lshr_shl_ne_vec_undef6(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t15_const_lshr_shl_ne_vec_undef6( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i1> @t15_const_lshr_shl_ne_vec_poison6(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t15_const_lshr_shl_ne_vec_poison6( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = and <3 x i32> [[TMP1]], [[Y:%.*]] ; CHECK-NEXT: [[T3:%.*]] = icmp ne <3 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: ret <3 x i1> [[T3]] ; - %t0 = lshr <3 x i32> %x, - %t1 = shl <3 x i32> %y, + %t0 = lshr <3 x i32> %x, + %t1 = shl <3 x i32> %y, %t2 = and <3 x i32> %t1, %t0 - %t3 = icmp ne <3 x i32> %t2, + %t3 = icmp ne <3 x i32> %t2, ret <3 x i1> %t3 } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll index 84dd4c57ebc61..9efc30cc9d916 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-ashr.ll @@ -42,13 +42,13 @@ define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) { ret <2 x i16> %t5 } -define <3 x i16> 
@t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t3_vec_nonsplat_undef0( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +define <3 x i16> @t3_vec_nonsplat_poison0(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_poison0( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = ashr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> @@ -57,9 +57,9 @@ define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { ret <3 x i16> %t5 } -define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t4_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +define <3 x i16> @t4_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; @@ -67,22 +67,22 @@ define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = ashr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = ashr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } -define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t5_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], +define <3 x i16> @t5_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i32> [[X:%.*]], ; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = ashr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = ashr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll index 214ec88d2e551..c31b6ed3ea2ba 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation-with-truncation-lshr.ll @@ -42,13 +42,13 @@ define <2 x i16> @t1_vec_splat(<2 x i32> %x, <2 x i16> %y) { ret <2 x i16> %t5 } -define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t3_vec_nonsplat_undef0( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], -; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +define <3 x i16> @t3_vec_nonsplat_poison0(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_poison0( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc nuw nsw <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = lshr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> @@ -57,32 +57,32 @@ define <3 x i16> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i16> %y) { ret <3 x i16> %t5 } -define <3 x i16> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t4_vec_nonsplat_undef1( -; 
CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], -; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +define <3 x i16> @t4_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc nuw nsw <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = lshr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = lshr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } -define <3 x i16> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i16> %y) { -; CHECK-LABEL: @t5_vec_nonsplat_undef1( -; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], -; CHECK-NEXT: [[T5:%.*]] = trunc <3 x i32> [[TMP1]] to <3 x i16> +define <3 x i16> @t5_vec_nonsplat_poison1(<3 x i32> %x, <3 x i16> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_poison1( +; CHECK-NEXT: [[TMP1:%.*]] = lshr <3 x i32> [[X:%.*]], +; CHECK-NEXT: [[T5:%.*]] = trunc nuw nsw <3 x i32> [[TMP1]] to <3 x i16> ; CHECK-NEXT: ret <3 x i16> [[T5]] ; - %t0 = sub <3 x i16> , %y + %t0 = sub <3 x i16> , %y %t1 = zext <3 x i16> %t0 to <3 x i32> %t2 = lshr <3 x i32> %x, %t1 %t3 = trunc <3 x i32> %t2 to <3 x i16> - %t4 = add <3 x i16> %y, + %t4 = add <3 x i16> %y, %t5 = lshr <3 x i16> %t3, %t4 ret <3 x i16> %t5 } diff --git a/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll b/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll index b96bcd6bab4f1..6bbe4c5151e45 100644 --- a/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll +++ b/llvm/test/Transforms/InstCombine/shift-amount-reassociation.ll @@ -48,38 +48,38 @@ define <2 x i32> @t2_vec_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; Basic vector tests -define <3 x i32> @t3_vec_nonsplat_undef0(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t3_vec_nonsplat_undef0( -; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i32> @t3_vec_nonsplat_poison0(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t3_vec_nonsplat_poison0( +; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t0 = sub <3 x i32> , %y + %t0 = sub <3 x i32> , %y %t1 = lshr <3 x i32> %x, %t0 %t2 = add <3 x i32> %y, %t3 = lshr <3 x i32> %t1, %t2 ret <3 x i32> %t3 } -define <3 x i32> @t4_vec_nonsplat_undef1(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t4_vec_nonsplat_undef1( -; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i32> @t4_vec_nonsplat_poison1(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t4_vec_nonsplat_poison1( +; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: ret <3 x i32> [[T3]] ; %t0 = sub <3 x i32> , %y %t1 = lshr <3 x i32> %x, %t0 - %t2 = add <3 x i32> %y, + %t2 = add <3 x i32> %y, %t3 = lshr <3 x i32> %t1, %t2 ret <3 x i32> %t3 } -define <3 x i32> @t5_vec_nonsplat_undef1(<3 x i32> %x, <3 x i32> %y) { -; CHECK-LABEL: @t5_vec_nonsplat_undef1( -; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], +define <3 x i32> @t5_vec_nonsplat_poison1(<3 x i32> %x, <3 x i32> %y) { +; CHECK-LABEL: @t5_vec_nonsplat_poison1( +; CHECK-NEXT: [[T3:%.*]] = lshr <3 x i32> [[X:%.*]], ; CHECK-NEXT: ret <3 x i32> [[T3]] ; - %t0 = sub <3 x i32> , %y + %t0 = sub <3 x i32> , %y %t1 = lshr <3 x i32> %x, %t0 - %t2 = add <3 x i32> %y, + %t2 = add <3 x i32> %y, %t3 = lshr <3 x i32> %t1, %t2 ret <3 x i32> %t3 } diff --git a/llvm/test/Transforms/InstCombine/shift-logic.ll 
b/llvm/test/Transforms/InstCombine/shift-logic.ll index c982b45b504e9..b591400c6a260 100644 --- a/llvm/test/Transforms/InstCombine/shift-logic.ll +++ b/llvm/test/Transforms/InstCombine/shift-logic.ll @@ -44,18 +44,18 @@ define i16 @shl_or(i16 %x, i16 %py) { ret i16 %sh1 } -define <2 x i16> @shl_or_undef(<2 x i16> %x, <2 x i16> %py) { -; CHECK-LABEL: @shl_or_undef( +define <2 x i16> @shl_or_poison(<2 x i16> %x, <2 x i16> %py) { +; CHECK-LABEL: @shl_or_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i16> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i16> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i16> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <2 x i16> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = or <2 x i16> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i16> [[SH1]] ; %y = srem <2 x i16> %py, ; thwart complexity-based canonicalization - %sh0 = shl <2 x i16> %x, + %sh0 = shl <2 x i16> %x, %r = or <2 x i16> %y, %sh0 - %sh1 = shl <2 x i16> %r, + %sh1 = shl <2 x i16> %r, ret <2 x i16> %sh1 } @@ -100,18 +100,18 @@ define i64 @lshr_and(i64 %x, i64 %py) { ret i64 %sh1 } -define <2 x i64> @lshr_and_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @lshr_and_undef( +define <2 x i64> @lshr_and_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @lshr_and_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i64> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = lshr <2 x i64> %x, + %sh0 = lshr <2 x i64> %x, %r = and <2 x i64> %y, %sh0 - %sh1 = lshr <2 x i64> %r, + %sh1 = lshr <2 x i64> %r, ret <2 x i64> %sh1 } @@ -212,16 +212,16 @@ define i32 @ashr_overshift_xor(i32 %x, i32 %y) { ret i32 %sh1 } -define <2 x i32> @ashr_undef_undef_xor(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @ashr_undef_undef_xor( -; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i32> [[X:%.*]], +define <2 x i32> @ashr_poison_poison_xor(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @ashr_poison_poison_xor( +; CHECK-NEXT: [[SH0:%.*]] = ashr <2 x i32> [[X:%.*]], ; CHECK-NEXT: [[R:%.*]] = xor <2 x i32> [[SH0]], [[Y:%.*]] -; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i32> [[R]], +; CHECK-NEXT: [[SH1:%.*]] = ashr <2 x i32> [[R]], ; CHECK-NEXT: ret <2 x i32> [[SH1]] ; - %sh0 = ashr <2 x i32> %x, + %sh0 = ashr <2 x i32> %x, %r = xor <2 x i32> %y, %sh0 - %sh1 = ashr <2 x i32> %r, + %sh1 = ashr <2 x i32> %r, ret <2 x i32> %sh1 } @@ -390,18 +390,18 @@ define <2 x i8> @shl_add_nonuniform(<2 x i8> %x, <2 x i8> %y) { } -define <2 x i64> @shl_add_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @shl_add_undef( +define <2 x i64> @shl_add_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @shl_add_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <2 x i64> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = shl <2 x i64> %x, + %sh0 = shl <2 x i64> %x, %r = add <2 x i64> %y, %sh0 - %sh1 = shl <2 x i64> %r, + %sh1 = shl <2 x i64> 
%r, ret <2 x i64> %sh1 } @@ -432,18 +432,18 @@ define <2 x i8> @lshr_add_nonuniform(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %sh1 } -define <2 x i64> @lshr_add_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @lshr_add_undef( +define <2 x i64> @lshr_add_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @lshr_add_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = add <2 x i64> [[Y]], [[SH0]] -; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], +; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = add nsw <2 x i64> [[Y]], [[SH0]] +; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = lshr <2 x i64> %x, + %sh0 = lshr <2 x i64> %x, %r = add <2 x i64> %y, %sh0 - %sh1 = lshr <2 x i64> %r, + %sh1 = lshr <2 x i64> %r, ret <2 x i64> %sh1 } @@ -488,18 +488,18 @@ define <2 x i8> @shl_sub_nonuniform(<2 x i8> %x, <2 x i8> %y) { } -define <2 x i64> @shl_sub_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @shl_sub_undef( +define <2 x i64> @shl_sub_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @shl_sub_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[Y]], +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shl nsw <2 x i64> [[Y]], ; CHECK-NEXT: [[SH1:%.*]] = sub <2 x i64> [[TMP2]], [[TMP1]] ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = shl <2 x i64> %x, + %sh0 = shl <2 x i64> %x, %r = sub <2 x i64> %y, %sh0 - %sh1 = shl <2 x i64> %r, + %sh1 = shl <2 x i64> %r, ret <2 x i64> %sh1 } @@ -530,17 +530,17 @@ define <2 x i8> @lshr_sub_nonuniform(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %sh1 } -define <2 x i64> @lshr_sub_undef(<2 x i64> %x, <2 x i64> %py) { -; CHECK-LABEL: @lshr_sub_undef( +define <2 x i64> @lshr_sub_poison(<2 x i64> %x, <2 x i64> %py) { +; CHECK-LABEL: @lshr_sub_poison( ; CHECK-NEXT: [[Y:%.*]] = srem <2 x i64> [[PY:%.*]], -; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], -; CHECK-NEXT: [[R:%.*]] = sub <2 x i64> [[Y]], [[SH0]] -; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], +; CHECK-NEXT: [[SH0:%.*]] = lshr <2 x i64> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i64> [[Y]], [[SH0]] +; CHECK-NEXT: [[SH1:%.*]] = lshr <2 x i64> [[R]], ; CHECK-NEXT: ret <2 x i64> [[SH1]] ; %y = srem <2 x i64> %py, ; thwart complexity-based canonicalization - %sh0 = lshr <2 x i64> %x, + %sh0 = lshr <2 x i64> %x, %r = sub <2 x i64> %y, %sh0 - %sh1 = lshr <2 x i64> %r, + %sh1 = lshr <2 x i64> %r, ret <2 x i64> %sh1 } diff --git a/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll index 406dc72f2646e..daa4955796594 100644 --- a/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/shl-and-negC-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_shl_and_negC_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_shl_and_negC_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_shl_and_negC_eq_undef1( +define <4 x i1> @vec_shl_and_negC_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_shl_and_negC_eq_poison1( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: 
[[R:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, ; ~7 + %and = and <4 x i32> %shl, ; ~7 %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_shl_and_negC_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_shl_and_negC_eq_undef2( +define <4 x i1> @vec_shl_and_negC_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_shl_and_negC_eq_poison2( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y %and = and <4 x i32> %shl, ; ~7 - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_shl_and_negC_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_shl_and_negC_eq_undef3( +define <4 x i1> @vec_shl_and_negC_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_shl_and_negC_eq_poison3( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp ult <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, ; ~7 - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %shl, ; ~7 + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll index 4c2c876e3925b..dcc181945357d 100644 --- a/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/shl-and-signbit-icmpeq-zero.ll @@ -81,39 +81,39 @@ define <4 x i1> @vec_4xi32_shl_and_signbit_eq(<4 x i32> %x, <4 x i32> %y) { ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_shl_and_signbit_eq_undef1(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_undef1( +define <4 x i1> @vec_4xi32_shl_and_signbit_eq_poison1(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_poison1( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, + %and = and <4 x i32> %shl, %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_shl_and_signbit_eq_undef2(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_undef2( +define <4 x i1> @vec_4xi32_shl_and_signbit_eq_poison2(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_poison2( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y %and = and <4 x i32> %shl, - %r = icmp eq <4 x i32> %and, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } -define <4 x i1> @vec_4xi32_shl_and_signbit_eq_undef3(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_undef3( +define <4 x i1> @vec_4xi32_shl_and_signbit_eq_poison3(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @vec_4xi32_shl_and_signbit_eq_poison3( ; CHECK-NEXT: [[SHL:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: [[R:%.*]] = icmp sgt <4 x i32> [[SHL]], ; CHECK-NEXT: ret <4 x i1> [[R]] ; %shl = shl <4 x i32> %x, %y - %and = and <4 x i32> %shl, - %r = icmp eq <4 x i32> %and, + %and = and <4 x i32> %shl, + %r = icmp eq <4 x i32> %and, ret <4 x i1> %r } diff --git 
a/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll b/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll index aeb4c8bb62cba..e7505721cad60 100644 --- a/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll +++ b/llvm/test/Transforms/InstCombine/signmask-of-sext-vs-of-shl-of-zext.ll @@ -129,40 +129,56 @@ define <2 x i32> @t8(<2 x i16> %x) { %r = and <2 x i32> %i1, ret <2 x i32> %r } + define <2 x i32> @t9(<2 x i16> %x) { ; CHECK-LABEL: @t9( -; CHECK-NEXT: [[X_SIGNEXT:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], +; CHECK-NEXT: [[I1:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[I1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %i0 = zext <2 x i16> %x to <2 x i32> - %i1 = shl <2 x i32> %i0, + %i1 = shl <2 x i32> %i0, %r = and <2 x i32> %i1, - ; Here undef can be propagated into the mask. ret <2 x i32> %r } -define <2 x i32> @t10(<2 x i16> %x) { -; CHECK-LABEL: @t10( -; CHECK-NEXT: [[X_SIGNEXT:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], + +; If we folded this, we wouldn't be able to keep the undef mask. +define <2 x i32> @t10_undef(<2 x i16> %x) { +; CHECK-LABEL: @t10_undef( +; CHECK-NEXT: [[I0:%.*]] = zext <2 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[I1:%.*]] = shl nuw <2 x i32> [[I0]], +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[I1]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %i0 = zext <2 x i16> %x to <2 x i32> %i1 = shl <2 x i32> %i0, %r = and <2 x i32> %i1, - ; CAREFUL! We can't keep undef mask here, since high bits are no longer zero, + ret <2 x i32> %r +} + +define <2 x i32> @t10_poison(<2 x i16> %x) { +; CHECK-LABEL: @t10_poison( +; CHECK-NEXT: [[I1:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[I1]], +; CHECK-NEXT: ret <2 x i32> [[R]] +; + %i0 = zext <2 x i16> %x to <2 x i32> + %i1 = shl <2 x i32> %i0, + %r = and <2 x i32> %i1, + ; CAREFUL! We can't keep poison mask here, since high bits are no longer zero, ; we must sanitize it to 0. ret <2 x i32> %r } + define <2 x i32> @t11(<2 x i16> %x) { ; CHECK-LABEL: @t11( ; CHECK-NEXT: [[X_SIGNEXT:%.*]] = sext <2 x i16> [[X:%.*]] to <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], +; CHECK-NEXT: [[R:%.*]] = and <2 x i32> [[X_SIGNEXT]], ; CHECK-NEXT: ret <2 x i32> [[R]] ; %i0 = zext <2 x i16> %x to <2 x i32> - %i1 = shl <2 x i32> %i0, - %r = and <2 x i32> %i1, - ; Here undef mask is fine. + %i1 = shl <2 x i32> %i0, + %r = and <2 x i32> %i1, + ; Here poison mask is fine. 
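+ ; (Likely because that result lane is already poison from the shift, so a poison mask lane cannot change the result; this is inferred from the neighboring t10 tests, not stated upstream.)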
ret <2 x i32> %r } diff --git a/llvm/test/Transforms/InstCombine/sitofp.ll b/llvm/test/Transforms/InstCombine/sitofp.ll index cc6b6425eb03c..51eff39cd900e 100644 --- a/llvm/test/Transforms/InstCombine/sitofp.ll +++ b/llvm/test/Transforms/InstCombine/sitofp.ll @@ -256,7 +256,7 @@ define i25 @consider_lowbits_masked_input(i25 %A) { define i32 @overflow_masked_input(i32 %A) { ; CHECK-LABEL: @overflow_masked_input( ; CHECK-NEXT: [[M:%.*]] = and i32 [[A:%.*]], 16777217 -; CHECK-NEXT: [[B:%.*]] = uitofp i32 [[M]] to float +; CHECK-NEXT: [[B:%.*]] = uitofp nneg i32 [[M]] to float ; CHECK-NEXT: [[C:%.*]] = fptoui float [[B]] to i32 ; CHECK-NEXT: ret i32 [[C]] ; diff --git a/llvm/test/Transforms/InstCombine/sub-not.ll b/llvm/test/Transforms/InstCombine/sub-not.ll index ec36754d3e9b1..89ccf5aa3c8f4 100644 --- a/llvm/test/Transforms/InstCombine/sub-not.ll +++ b/llvm/test/Transforms/InstCombine/sub-not.ll @@ -34,7 +34,7 @@ define <2 x i8> @sub_not_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = sub <2 x i8> %x, %y - %r = xor <2 x i8> %s, + %r = xor <2 x i8> %s, ret <2 x i8> %r } @@ -69,7 +69,7 @@ define <2 x i8> @dec_sub_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: ret <2 x i8> [[R]] ; %s = sub <2 x i8> %x, %y - %r = add <2 x i8> %s, + %r = add <2 x i8> %s, ret <2 x i8> %r } @@ -103,7 +103,7 @@ define <2 x i8> @sub_inc_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[S_NEG]], [[Y:%.*]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %s = add <2 x i8> %x, + %s = add <2 x i8> %x, %r = sub <2 x i8> %y, %s ret <2 x i8> %r } @@ -138,7 +138,7 @@ define <2 x i8> @sub_dec_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-NEXT: [[R:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %s = add <2 x i8> %x, + %s = add <2 x i8> %x, %r = sub <2 x i8> %s, %y ret <2 x i8> %r } diff --git a/llvm/test/Transforms/InstCombine/sub-xor.ll b/llvm/test/Transforms/InstCombine/sub-xor.ll index 71da73d51ae37..b4e87d0405fc4 100644 --- a/llvm/test/Transforms/InstCombine/sub-xor.ll +++ b/llvm/test/Transforms/InstCombine/sub-xor.ll @@ -157,3 +157,26 @@ define <2 x i8> @xor_add_splat_undef(<2 x i8> %x) { %add = add <2 x i8> %xor, ret <2 x i8> %add } + +define i32 @xor_dominating_cond(i32 %x) { +; CHECK-LABEL: @xor_dominating_cond( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[X:%.*]], 256 +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[A:%.*]] = xor i32 [[X]], 255 +; CHECK-NEXT: ret i32 [[A]] +; CHECK: if.end: +; CHECK-NEXT: ret i32 [[X]] +; +entry: + %cond = icmp ult i32 %x, 256 + br i1 %cond, label %if.then, label %if.end + +if.then: + %a = sub i32 255, %x + ret i32 %a + +if.end: + ret i32 %x +} diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index 249b5673c8acf..a84e389f13c3b 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -130,44 +130,44 @@ define <2 x i32> @neg_nsw_sub_nsw_vec(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %r } -define <2 x i32> @neg_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_sub_vec_undef( +define <2 x i32> @neg_sub_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_sub_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub <2 x i32> , %x + %neg = sub <2 x i32> , %x %r = sub <2 x i32> %y, %neg ret <2 x i32> %r } -define <2 x i32> @neg_nsw_sub_vec_undef(<2 x i32> %x, <2 x i32> %y) 
{ -; CHECK-LABEL: @neg_nsw_sub_vec_undef( +define <2 x i32> @neg_nsw_sub_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_nsw_sub_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub nsw <2 x i32> , %x + %neg = sub nsw <2 x i32> , %x %r = sub <2 x i32> %y, %neg ret <2 x i32> %r } -define <2 x i32> @neg_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_sub_nsw_vec_undef( +define <2 x i32> @neg_sub_nsw_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_sub_nsw_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub <2 x i32> , %x + %neg = sub <2 x i32> , %x %r = sub nsw <2 x i32> %y, %neg ret <2 x i32> %r } ; This should not drop 'nsw'. -define <2 x i32> @neg_nsw_sub_nsw_vec_undef(<2 x i32> %x, <2 x i32> %y) { -; CHECK-LABEL: @neg_nsw_sub_nsw_vec_undef( +define <2 x i32> @neg_nsw_sub_nsw_vec_poison(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @neg_nsw_sub_nsw_vec_poison( ; CHECK-NEXT: [[R:%.*]] = add nsw <2 x i32> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i32> [[R]] ; - %neg = sub nsw <2 x i32> , %x + %neg = sub nsw <2 x i32> , %x %r = sub nsw <2 x i32> %y, %neg ret <2 x i32> %r } @@ -205,13 +205,13 @@ define <2 x i8> @notnotsub_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i8> %sub } -define <2 x i8> @notnotsub_vec_undef_elts(<2 x i8> %x, <2 x i8> %y) { -; CHECK-LABEL: @notnotsub_vec_undef_elts( +define <2 x i8> @notnotsub_vec_poison_elts(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @notnotsub_vec_poison_elts( ; CHECK-NEXT: [[SUB:%.*]] = sub <2 x i8> [[Y:%.*]], [[X:%.*]] ; CHECK-NEXT: ret <2 x i8> [[SUB]] ; - %nx = xor <2 x i8> %x, - %ny = xor <2 x i8> %y, + %nx = xor <2 x i8> %x, + %ny = xor <2 x i8> %y, %sub = sub <2 x i8> %nx, %ny ret <2 x i8> %sub } @@ -2351,12 +2351,12 @@ define <2 x i8> @sub_to_and_vector1(<2 x i8> %x) { define <2 x i8> @sub_to_and_vector2(<2 x i8> %x) { ; CHECK-LABEL: @sub_to_and_vector2( -; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] +; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], ; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> , [[AND]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; - %sub = sub nuw <2 x i8> , %x + %sub = sub nuw <2 x i8> , %x %and = and <2 x i8> %sub, %r = sub <2 x i8> , %and ret <2 x i8> %r @@ -2366,12 +2366,12 @@ define <2 x i8> @sub_to_and_vector2(<2 x i8> %x) { define <2 x i8> @sub_to_and_vector3(<2 x i8> %x) { ; CHECK-LABEL: @sub_to_and_vector3( ; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] -; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], ; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> , [[AND]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %sub = sub nuw <2 x i8> , %x - %and = and <2 x i8> %sub, + %and = and <2 x i8> %sub, %r = sub <2 x i8> , %and ret <2 x i8> %r } @@ -2381,12 +2381,12 @@ define <2 x i8> @sub_to_and_vector4(<2 x i8> %x) { ; CHECK-LABEL: @sub_to_and_vector4( ; CHECK-NEXT: [[SUB:%.*]] = sub nuw <2 x i8> , [[X:%.*]] ; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[SUB]], -; CHECK-NEXT: [[R:%.*]] = sub <2 x i8> , [[AND]] +; CHECK-NEXT: [[R:%.*]] = sub nsw <2 x i8> , [[AND]] ; CHECK-NEXT: ret <2 x i8> [[R]] ; %sub = sub nuw <2 x i8> , %x %and = and <2 x i8> %sub, - %r = sub <2 x i8> , %and + %r = sub <2 x i8> , %and ret <2 x i8> %r } diff --git a/llvm/test/Transforms/InstCombine/threadlocal_address.ll b/llvm/test/Transforms/InstCombine/threadlocal_address.ll new 
file mode 100644 index 0000000000000..0c220d996839e --- /dev/null +++ b/llvm/test/Transforms/InstCombine/threadlocal_address.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -o - -S %s -passes=instcombine | FileCheck %s + +@tlsvar_a4 = thread_local global i32 4, align 4 + +define void @func_increase_alignment() { +; CHECK-LABEL: define void @func_increase_alignment() { +; CHECK-NEXT: [[P:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr @tlsvar_a4) +; CHECK-NEXT: store i32 42, ptr [[P]], align 2 +; CHECK-NEXT: ret void +; + %p = call align 2 ptr @llvm.threadlocal.address(ptr @tlsvar_a4) + store i32 42, ptr %p, align 2 + ret void +} + +@tlsvar_a32 = thread_local global i32 5, align 32 + +define i1 @func_add_alignment() { +; CHECK-LABEL: define i1 @func_add_alignment() { +; CHECK-NEXT: ret i1 true +; + %p = call ptr @llvm.threadlocal.address(ptr @tlsvar_a32) + %p_int = ptrtoint ptr %p to i32 + %lowbits = and i32 %p_int, 31 + %zero = icmp eq i32 %lowbits, 0 + ret i1 %zero +} + +@tlsvar_a1 = thread_local global i8 6, align 1 + +define i1 @func_dont_reduce_alignment() { +; CHECK-LABEL: define i1 @func_dont_reduce_alignment() { +; CHECK-NEXT: ret i1 true +; + %p = call align 4 ptr @llvm.threadlocal.address(ptr @tlsvar_a1) + %p_int = ptrtoint ptr %p to i32 + %lowbits = and i32 %p_int, 3 + %zero = icmp eq i32 %lowbits, 0 + ret i1 %zero +} diff --git a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll index 4c857125365a9..063006ba5eea8 100644 --- a/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/trunc-inseltpoison.ll @@ -49,15 +49,15 @@ define <2 x i64> @test1_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test1_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test1_vec_undef( +define <2 x i64> @test1_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test1_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = and <2 x i32> %b, + %c = and <2 x i32> %b, %d = zext <2 x i32> %c to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -111,17 +111,17 @@ define <2 x i64> @test2_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test2_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test2_vec_undef( +define <2 x i64> @test2_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test2_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], -; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], +; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = shl <2 x i32> %b, - %q = ashr <2 x i32> %c, + %c = shl <2 x i32> %b, + %q = ashr <2 x i32> %c, %d = sext <2 x i32> %q to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -300,18 +300,17 @@ define <2 x i64> @test8_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test8_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test8_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; 
CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = shl <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = or <2 x i128> [[E]], [[C]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test8_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test8_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64> +; CHECK-NEXT: [[E:%.*]] = shl nuw <2 x i64> [[D]], +; CHECK-NEXT: [[G:%.*]] = or disjoint <2 x i64> [[E]], [[C]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = shl <2 x i128> %D, + %E = shl <2 x i128> %D, %F = or <2 x i128> %E, %C %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -388,18 +387,17 @@ define <2 x i64> @test11_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test11_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test11_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = shl <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test11_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test11_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = shl nuw nsw <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = shl <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -453,18 +451,17 @@ define <2 x i64> @test12_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test12_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test12_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = lshr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nuw nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test12_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test12_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = lshr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = lshr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -518,18 +515,17 @@ define <2 x i64> @test13_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test13_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test13_vec_undef( -; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = ashr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nsw <2 x i128> [[F]] to <2 x i64> 
+define <2 x i64> @test13_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test13_vec_poison( +; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = ashr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = sext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = ashr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -766,13 +762,13 @@ define <2 x i32> @trunc_shl_v2i32_v2i64_uniform(<2 x i64> %val) { ret <2 x i32> %trunc } -define <2 x i32> @trunc_shl_v2i32_v2i64_undef(<2 x i64> %val) { -; CHECK-LABEL: @trunc_shl_v2i32_v2i64_undef( +define <2 x i32> @trunc_shl_v2i32_v2i64_poison(<2 x i64> %val) { +; CHECK-LABEL: @trunc_shl_v2i32_v2i64_poison( ; CHECK-NEXT: [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32> -; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], +; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], ; CHECK-NEXT: ret <2 x i32> [[TRUNC]] ; - %shl = shl <2 x i64> %val, + %shl = shl <2 x i64> %val, %trunc = trunc <2 x i64> %shl to <2 x i32> ret <2 x i32> %trunc } @@ -917,7 +913,7 @@ define <4 x i8> @wide_shuf(<4 x i32> %x) { ret <4 x i8> %trunc } -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-LABEL: @wide_splat1( @@ -931,7 +927,7 @@ define <4 x i8> @wide_splat1(<4 x i32> %x) { } ; Test weird types. -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat2( @@ -945,8 +941,8 @@ define <3 x i31> @wide_splat2(<3 x i33> %x) { } ; FIXME: -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask -; A mask with undef elements should still be considered a splat mask. +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask +; A mask with poison elements should still be considered a splat mask. 
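; (Illustrative sketch, not from the patch; the mask lanes are assumed: a splat
; mask such as <3 x i32> <i32 1, i32 1, i32 poison> still selects element 1 in
; every defined lane, so the hoped-for end state of the FIXME above would be
;   %narrow = trunc <3 x i33> %x to <3 x i31>
;   %r = shufflevector <3 x i31> %narrow, <3 x i31> poison, <3 x i32> <i32 1, i32 1, i32 poison>
; i.e. truncate first, then broadcast.)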
define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat3( @@ -954,7 +950,7 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; - %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> + %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> %trunc = trunc <3 x i33> %shuf to <3 x i31> ret <3 x i31> %trunc } diff --git a/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll b/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll index 2c5f428cf98de..c50a3d06d24b9 100644 --- a/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc-shift-trunc.ll @@ -56,14 +56,14 @@ define <2 x i8> @trunc_lshr_trunc_nonuniform(<2 x i64> %a) { ret <2 x i8> %d } -define <2 x i8> @trunc_lshr_trunc_uniform_undef(<2 x i64> %a) { -; CHECK-LABEL: @trunc_lshr_trunc_uniform_undef( -; CHECK-NEXT: [[C1:%.*]] = lshr <2 x i64> [[A:%.*]], +define <2 x i8> @trunc_lshr_trunc_uniform_poison(<2 x i64> %a) { +; CHECK-LABEL: @trunc_lshr_trunc_uniform_poison( +; CHECK-NEXT: [[C1:%.*]] = lshr <2 x i64> [[A:%.*]], ; CHECK-NEXT: [[D:%.*]] = trunc <2 x i64> [[C1]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = lshr <2 x i32> %b, + %c = lshr <2 x i32> %b, %d = trunc <2 x i32> %c to <2 x i8> ret <2 x i8> %d } @@ -142,14 +142,14 @@ define <2 x i8> @trunc_ashr_trunc_nonuniform(<2 x i64> %a) { ret <2 x i8> %d } -define <2 x i8> @trunc_ashr_trunc_uniform_undef(<2 x i64> %a) { -; CHECK-LABEL: @trunc_ashr_trunc_uniform_undef( -; CHECK-NEXT: [[C1:%.*]] = ashr <2 x i64> [[A:%.*]], +define <2 x i8> @trunc_ashr_trunc_uniform_poison(<2 x i64> %a) { +; CHECK-LABEL: @trunc_ashr_trunc_uniform_poison( +; CHECK-NEXT: [[C1:%.*]] = ashr <2 x i64> [[A:%.*]], ; CHECK-NEXT: [[D:%.*]] = trunc <2 x i64> [[C1]] to <2 x i8> ; CHECK-NEXT: ret <2 x i8> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = ashr <2 x i32> %b, + %c = ashr <2 x i32> %b, %d = trunc <2 x i32> %c to <2 x i8> ret <2 x i8> %d } diff --git a/llvm/test/Transforms/InstCombine/trunc.ll b/llvm/test/Transforms/InstCombine/trunc.ll index c77d7269f2cf7..e59b2bea6684c 100644 --- a/llvm/test/Transforms/InstCombine/trunc.ll +++ b/llvm/test/Transforms/InstCombine/trunc.ll @@ -49,15 +49,15 @@ define <2 x i64> @test1_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test1_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test1_vec_undef( +define <2 x i64> @test1_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test1_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = and <2 x i64> [[A]], ; CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = and <2 x i32> %b, + %c = and <2 x i32> %b, %d = zext <2 x i32> %c to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -111,17 +111,17 @@ define <2 x i64> @test2_vec_nonuniform(<2 x i64> %a) { ret <2 x i64> %d } -define <2 x i64> @test2_vec_undef(<2 x i64> %a) { -; CHECK-LABEL: @test2_vec_undef( +define <2 x i64> @test2_vec_poison(<2 x i64> %a) { +; CHECK-LABEL: @test2_vec_poison( ; CHECK-NEXT: [[B:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i32> -; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], -; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], +; CHECK-NEXT: [[D1:%.*]] = shl <2 x i64> [[A]], +; CHECK-NEXT: [[D:%.*]] = ashr exact <2 x i64> [[D1]], ; 
CHECK-NEXT: call void @use_vec(<2 x i32> [[B]]) ; CHECK-NEXT: ret <2 x i64> [[D]] ; %b = trunc <2 x i64> %a to <2 x i32> - %c = shl <2 x i32> %b, - %q = ashr <2 x i32> %c, + %c = shl <2 x i32> %b, + %q = ashr <2 x i32> %c, %d = sext <2 x i32> %q to <2 x i64> call void @use_vec(<2 x i32> %b) ret <2 x i64> %d @@ -300,18 +300,17 @@ define <2 x i64> @test8_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test8_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test8_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = shl <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = or <2 x i128> [[E]], [[C]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test8_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test8_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i64> +; CHECK-NEXT: [[E:%.*]] = shl nuw <2 x i64> [[D]], +; CHECK-NEXT: [[G:%.*]] = or disjoint <2 x i64> [[E]], [[C]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = shl <2 x i128> %D, + %E = shl <2 x i128> %D, %F = or <2 x i128> %E, %C %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -388,18 +387,17 @@ define <2 x i64> @test11_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test11_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test11_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = shl <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test11_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test11_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = shl nuw nsw <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = shl <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -453,18 +451,17 @@ define <2 x i64> @test12_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test12_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test12_vec_undef( -; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = lshr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nuw nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test12_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test12_vec_poison( +; CHECK-NEXT: [[C:%.*]] = zext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = lshr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = zext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = lshr <2 
x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -518,18 +515,17 @@ define <2 x i64> @test13_vec_nonuniform(<2 x i32> %A, <2 x i32> %B) { ret <2 x i64> %G } -define <2 x i64> @test13_vec_undef(<2 x i32> %A, <2 x i32> %B) { -; CHECK-LABEL: @test13_vec_undef( -; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i128> -; CHECK-NEXT: [[D:%.*]] = zext <2 x i32> [[B:%.*]] to <2 x i128> -; CHECK-NEXT: [[E:%.*]] = and <2 x i128> [[D]], -; CHECK-NEXT: [[F:%.*]] = ashr <2 x i128> [[C]], [[E]] -; CHECK-NEXT: [[G:%.*]] = trunc nsw <2 x i128> [[F]] to <2 x i64> +define <2 x i64> @test13_vec_poison(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: @test13_vec_poison( +; CHECK-NEXT: [[C:%.*]] = sext <2 x i32> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[E:%.*]] = zext nneg <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[G:%.*]] = ashr <2 x i64> [[C]], [[E]] ; CHECK-NEXT: ret <2 x i64> [[G]] ; %C = sext <2 x i32> %A to <2 x i128> %D = zext <2 x i32> %B to <2 x i128> - %E = and <2 x i128> %D, + %E = and <2 x i128> %D, %F = ashr <2 x i128> %C, %E %G = trunc <2 x i128> %F to <2 x i64> ret <2 x i64> %G @@ -766,13 +762,13 @@ define <2 x i32> @trunc_shl_v2i32_v2i64_uniform(<2 x i64> %val) { ret <2 x i32> %trunc } -define <2 x i32> @trunc_shl_v2i32_v2i64_undef(<2 x i64> %val) { -; CHECK-LABEL: @trunc_shl_v2i32_v2i64_undef( +define <2 x i32> @trunc_shl_v2i32_v2i64_poison(<2 x i64> %val) { +; CHECK-LABEL: @trunc_shl_v2i32_v2i64_poison( ; CHECK-NEXT: [[VAL_TR:%.*]] = trunc <2 x i64> [[VAL:%.*]] to <2 x i32> -; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], +; CHECK-NEXT: [[TRUNC:%.*]] = shl <2 x i32> [[VAL_TR]], ; CHECK-NEXT: ret <2 x i32> [[TRUNC]] ; - %shl = shl <2 x i64> %val, + %shl = shl <2 x i64> %val, %trunc = trunc <2 x i64> %shl to <2 x i32> ret <2 x i32> %trunc } @@ -917,7 +913,7 @@ define <4 x i8> @wide_shuf(<4 x i32> %x) { ret <4 x i8> %trunc } -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-LABEL: @wide_splat1( @@ -925,13 +921,13 @@ define <4 x i8> @wide_splat1(<4 x i32> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i8> [[TRUNC]] ; - %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> + %shuf = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> %trunc = trunc <4 x i32> %shuf to <4 x i8> ret <4 x i8> %trunc } ; Test weird types. -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat2( @@ -939,14 +935,14 @@ define <3 x i31> @wide_splat2(<3 x i33> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = shufflevector <3 x i31> [[TMP1]], <3 x i31> poison, <3 x i32> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; - %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> + %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> %trunc = trunc <3 x i33> %shuf to <3 x i31> ret <3 x i31> %trunc } ; FIXME: -; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask -; A mask with undef elements should still be considered a splat mask. +; trunc (shuffle X, poison, SplatMask) --> shuffle (trunc X), poison, SplatMask +; A mask with poison elements should still be considered a splat mask. 
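; (Illustrative note on why the splat fold is sound, reusing the types of
; wide_splat1 above; lane-wise semantics assumed: trunc operates per lane, so
;   %s = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
;   %t = trunc <4 x i32> %s to <4 x i8>
; computes the same lanes as
;   %n = trunc <4 x i32> %x to <4 x i8>
;   %t = shufflevector <4 x i8> %n, <4 x i8> poison, <4 x i32> zeroinitializer
; which is why the trunc-first form is preferred.)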
define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-LABEL: @wide_splat3( @@ -954,7 +950,7 @@ define <3 x i31> @wide_splat3(<3 x i33> %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31> ; CHECK-NEXT: ret <3 x i31> [[TRUNC]] ; - %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> + %shuf = shufflevector <3 x i33> %x, <3 x i33> poison, <3 x i32> %trunc = trunc <3 x i33> %shuf to <3 x i31> ret <3 x i31> %trunc } diff --git a/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll b/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll index 1ffcfb4424e31..241d9cbcde338 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-mul-lack-of-overflow-check-via-udiv-of-allones.ll @@ -30,14 +30,14 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %r } -define <3 x i1> @t2_vec_undef(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @t2_vec_undef( +define <3 x i1> @t2_vec_poison(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @t2_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = call { <3 x i8>, <3 x i1> } @llvm.umul.with.overflow.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) ; CHECK-NEXT: [[MUL_OV:%.*]] = extractvalue { <3 x i8>, <3 x i1> } [[MUL]], 1 ; CHECK-NEXT: [[MUL_NOT_OV:%.*]] = xor <3 x i1> [[MUL_OV]], ; CHECK-NEXT: ret <3 x i1> [[MUL_NOT_OV]] ; - %t0 = udiv <3 x i8> , %x + %t0 = udiv <3 x i8> , %x %r = icmp uge <3 x i8> %t0, %y ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll b/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll index 710a09f6e16a1..7eb08bdd6016c 100644 --- a/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll +++ b/llvm/test/Transforms/InstCombine/unsigned-mul-overflow-check-via-udiv-of-allones.ll @@ -28,13 +28,13 @@ define <2 x i1> @t1_vec(<2 x i8> %x, <2 x i8> %y) { ret <2 x i1> %r } -define <3 x i1> @t2_vec_undef(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @t2_vec_undef( +define <3 x i1> @t2_vec_poison(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @t2_vec_poison( ; CHECK-NEXT: [[MUL:%.*]] = call { <3 x i8>, <3 x i1> } @llvm.umul.with.overflow.v3i8(<3 x i8> [[X:%.*]], <3 x i8> [[Y:%.*]]) ; CHECK-NEXT: [[MUL_OV:%.*]] = extractvalue { <3 x i8>, <3 x i1> } [[MUL]], 1 ; CHECK-NEXT: ret <3 x i1> [[MUL_OV]] ; - %t0 = udiv <3 x i8> , %x + %t0 = udiv <3 x i8> , %x %r = icmp ult <3 x i8> %t0, %y ret <3 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll b/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll index adacf3ce99b2f..262942aa1219b 100644 --- a/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll +++ b/llvm/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll @@ -203,20 +203,20 @@ define <2 x i32> @t4_vec(<2 x i64> %data, <2 x i32> %nbits) { ret <2 x i32> %signextended } -define <3 x i32> @t5_vec_undef(<3 x i64> %data, <3 x i32> %nbits) { -; CHECK-LABEL: @t5_vec_undef( -; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> , [[NBITS:%.*]] +define <3 x i32> @t5_vec_poison(<3 x i64> %data, <3 x i32> %nbits) { +; CHECK-LABEL: @t5_vec_poison( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> , [[NBITS:%.*]] ; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext nneg <3 x i32> [[SKIP_HIGH]] to <3 x i64> ; CHECK-NEXT: 
[[TMP1:%.*]] = ashr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]] ; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <3 x i64> [[TMP1]] to <3 x i32> ; CHECK-NEXT: ret <3 x i32> [[SIGNEXTENDED]] ; - %skip_high = sub <3 x i32> , %nbits + %skip_high = sub <3 x i32> , %nbits %skip_high_wide = zext <3 x i32> %skip_high to <3 x i64> %extracted = lshr <3 x i64> %data, %skip_high_wide %extracted_narrow = trunc <3 x i64> %extracted to <3 x i32> - %num_high_bits_to_smear_narrow0 = sub <3 x i32> , %nbits - %num_high_bits_to_smear_narrow1 = sub <3 x i32> , %nbits + %num_high_bits_to_smear_narrow0 = sub <3 x i32> , %nbits + %num_high_bits_to_smear_narrow1 = sub <3 x i32> , %nbits %signbit_positioned = shl <3 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow0 %signextended = ashr <3 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow1 ret <3 x i32> %signextended diff --git a/llvm/test/Transforms/InstCombine/vec_sext.ll b/llvm/test/Transforms/InstCombine/vec_sext.ll index a880d5e562725..9f5f957f49445 100644 --- a/llvm/test/Transforms/InstCombine/vec_sext.ll +++ b/llvm/test/Transforms/InstCombine/vec_sext.ll @@ -42,24 +42,24 @@ define <4 x i32> @vec_select_alternate_sign_bit_test(<4 x i32> %a, <4 x i32> %b) ret <4 x i32> %cond } -define <2 x i32> @is_negative_undef_elt(<2 x i32> %a) { -; CHECK-LABEL: @is_negative_undef_elt( +define <2 x i32> @is_negative_poison_elt(<2 x i32> %a) { +; CHECK-LABEL: @is_negative_poison_elt( ; CHECK-NEXT: [[A_LOBIT:%.*]] = ashr <2 x i32> [[A:%.*]], ; CHECK-NEXT: ret <2 x i32> [[A_LOBIT]] ; - %cmp = icmp slt <2 x i32> %a, + %cmp = icmp slt <2 x i32> %a, %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext } -define <2 x i32> @is_positive_undef_elt(<2 x i32> %a) { -; CHECK-LABEL: @is_positive_undef_elt( -; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> [[A:%.*]], +define <2 x i32> @is_positive_poison_elt(<2 x i32> %a) { +; CHECK-LABEL: @is_positive_poison_elt( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[SEXT:%.*]] = sext <2 x i1> [[CMP]] to <2 x i32> ; CHECK-NEXT: ret <2 x i32> [[SEXT]] ; - %cmp = icmp sgt <2 x i32> %a, + %cmp = icmp sgt <2 x i32> %a, %sext = sext <2 x i1> %cmp to <2 x i32> ret <2 x i32> %sext } diff --git a/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll index cf1b72fbcf3e1..a87364600ba30 100644 --- a/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vector-casts-inseltpoison.ll @@ -26,26 +26,26 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; This is trunc. -define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elt(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elt( ; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, + %t = and <2 x i64> %a, %r = icmp ne <2 x i64> %t, zeroinitializer ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. +; TODO: This could be just 1 instruction (trunc), but our poison matching is incomplete. 
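; (Illustrative sketch of what the TODO above would give; the exact constants
; are assumed: with complete poison-element matching,
;   %t = and <2 x i64> %a, <i64 1, i64 poison>
;   %r = icmp ne <2 x i64> %t, <i64 0, i64 poison>
; would fold to the single instruction
;   %r = trunc <2 x i64> %a to <2 x i1>
; just like the and_cmp_is_trunc test at the top of this file.)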
-define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elts(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elts( +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, - %r = icmp ne <2 x i64> %t, + %t = and <2 x i64> %a, + %r = icmp ne <2 x i64> %t, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/vector-casts.ll b/llvm/test/Transforms/InstCombine/vector-casts.ll index 281fc5f6011ea..fd2a4ffdfb709 100644 --- a/llvm/test/Transforms/InstCombine/vector-casts.ll +++ b/llvm/test/Transforms/InstCombine/vector-casts.ll @@ -26,26 +26,26 @@ define <2 x i1> @and_cmp_is_trunc(<2 x i64> %a) { ; This is trunc. -define <2 x i1> @and_cmp_is_trunc_even_with_undef_elt(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elt( +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elt(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elt( ; CHECK-NEXT: [[R:%.*]] = trunc <2 x i64> [[A:%.*]] to <2 x i1> ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, + %t = and <2 x i64> %a, %r = icmp ne <2 x i64> %t, zeroinitializer ret <2 x i1> %r } -; TODO: This could be just 1 instruction (trunc), but our undef matching is incomplete. +; TODO: This could be just 1 instruction (trunc), but our poison matching is incomplete. -define <2 x i1> @and_cmp_is_trunc_even_with_undef_elts(<2 x i64> %a) { -; CHECK-LABEL: @and_cmp_is_trunc_even_with_undef_elts( -; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], -; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], +define <2 x i1> @and_cmp_is_trunc_even_with_poison_elts(<2 x i64> %a) { +; CHECK-LABEL: @and_cmp_is_trunc_even_with_poison_elts( +; CHECK-NEXT: [[T:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[R:%.*]] = icmp ne <2 x i64> [[T]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; - %t = and <2 x i64> %a, - %r = icmp ne <2 x i64> %t, + %t = and <2 x i64> %a, + %r = icmp ne <2 x i64> %t, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstCombine/vector-urem.ll b/llvm/test/Transforms/InstCombine/vector-urem.ll index d5c77470a20f8..627789a03ef6c 100644 --- a/llvm/test/Transforms/InstCombine/vector-urem.ll +++ b/llvm/test/Transforms/InstCombine/vector-urem.ll @@ -19,11 +19,11 @@ define <4 x i32> @test_v4i32_const_pow2(<4 x i32> %a0) { ret <4 x i32> %1 } -define <4 x i32> @test_v4i32_const_pow2_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_const_pow2_undef( +define <4 x i32> @test_v4i32_const_pow2_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_const_pow2_poison( ; CHECK-NEXT: ret <4 x i32> poison ; - %1 = urem <4 x i32> %a0, + %1 = urem <4 x i32> %a0, ret <4 x i32> %1 } @@ -37,13 +37,13 @@ define <4 x i32> @test_v4i32_one(<4 x i32> %a0) { ret <4 x i32> %1 } -define <4 x i32> @test_v4i32_one_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_one_undef( +define <4 x i32> @test_v4i32_one_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_one_poison( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i32> [[A0:%.*]], ; CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[TMP1]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; - %1 = urem <4 x i32> , %a0 + %1 = urem <4 x i32> , %a0 ret <4 x i32> %1 } @@ -71,10 +71,10 @@ define <4 x i32> @test_v4i32_negconst(<4 x i32> %a0) { ret <4 x i32> %1 } 
-define <4 x i32> @test_v4i32_negconst_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_negconst_undef( +define <4 x i32> @test_v4i32_negconst_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_negconst_poison( ; CHECK-NEXT: ret <4 x i32> poison ; - %1 = urem <4 x i32> %a0, + %1 = urem <4 x i32> %a0, ret <4 x i32> %1 } diff --git a/llvm/test/Transforms/InstCombine/vector-xor.ll b/llvm/test/Transforms/InstCombine/vector-xor.ll index 171dd6e35b4e1..ee593b5d15e8e 100644 --- a/llvm/test/Transforms/InstCombine/vector-xor.ll +++ b/llvm/test/Transforms/InstCombine/vector-xor.ll @@ -53,14 +53,14 @@ define <4 x i32> @test_v4i32_xor_bswap_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_xor_bswap_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_xor_bswap_const_undef( +define <4 x i32> @test_v4i32_xor_bswap_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_xor_bswap_const_poison( ; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> [[A0:%.*]]) -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; %1 = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %a0) - %2 = xor <4 x i32> %1, + %2 = xor <4 x i32> %1, ret <4 x i32> %2 } @@ -105,14 +105,14 @@ define <4 x i32> @test_v4i32_not_ashr_not(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %3 } -define <4 x i32> @test_v4i32_not_ashr_not_undef(<4 x i32> %x, <4 x i32> %y) { -; CHECK-LABEL: @test_v4i32_not_ashr_not_undef( +define <4 x i32> @test_v4i32_not_ashr_not_poison(<4 x i32> %x, <4 x i32> %y) { +; CHECK-LABEL: @test_v4i32_not_ashr_not_poison( ; CHECK-NEXT: [[DOTNOT:%.*]] = ashr <4 x i32> [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: ret <4 x i32> [[DOTNOT]] ; - %1 = xor <4 x i32> , %x + %1 = xor <4 x i32> , %x %2 = ashr <4 x i32> %1, %y - %3 = xor <4 x i32> , %2 + %3 = xor <4 x i32> , %2 ret <4 x i32> %3 } @@ -138,13 +138,13 @@ define <4 x i32> @test_v4i32_not_ashr_negative_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_not_ashr_negative_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_not_ashr_negative_const_undef( +define <4 x i32> @test_v4i32_not_ashr_negative_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_not_ashr_negative_const_poison( ; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> , [[A0:%.*]] ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; - %1 = ashr <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = ashr <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -170,13 +170,13 @@ define <4 x i32> @test_v4i32_not_lshr_nonnegative_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_not_lshr_nonnegative_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_not_lshr_nonnegative_const_undef( +define <4 x i32> @test_v4i32_not_lshr_nonnegative_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_not_lshr_nonnegative_const_poison( ; CHECK-NEXT: [[TMP1:%.*]] = ashr <4 x i32> , [[A0:%.*]] ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; - %1 = lshr <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = lshr <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -202,13 +202,13 @@ define <4 x i32> @test_v4i32_not_sub_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_not_sub_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_not_sub_const_undef( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], +define <4 x i32> @test_v4i32_not_sub_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_not_sub_const_poison( +; CHECK-NEXT: [[TMP1:%.*]] 
= add <4 x i32> [[A0:%.*]], ; CHECK-NEXT: ret <4 x i32> [[TMP1]] ; - %1 = sub <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = sub <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -235,14 +235,14 @@ define <4 x i32> @test_v4i32_xor_signmask_sub_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_xor_signmask_sub_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_xor_signmask_sub_const_undef( -; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> , [[A0:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +define <4 x i32> @test_v4i32_xor_signmask_sub_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_xor_signmask_sub_const_poison( +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> , [[A0:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; - %1 = sub <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = sub <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } @@ -269,13 +269,13 @@ define <4 x i32> @test_v4i32_xor_signmask_add_const(<4 x i32> %a0) { ret <4 x i32> %2 } -define <4 x i32> @test_v4i32_xor_signmask_add_const_undef(<4 x i32> %a0) { -; CHECK-LABEL: @test_v4i32_xor_signmask_add_const_undef( -; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], +define <4 x i32> @test_v4i32_xor_signmask_add_const_poison(<4 x i32> %a0) { +; CHECK-LABEL: @test_v4i32_xor_signmask_add_const_poison( +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A0:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], ; CHECK-NEXT: ret <4 x i32> [[TMP2]] ; - %1 = add <4 x i32> , %a0 - %2 = xor <4 x i32> , %1 + %1 = add <4 x i32> , %a0 + %2 = xor <4 x i32> , %1 ret <4 x i32> %2 } diff --git a/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll b/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll index 7fed952a7ff7e..12739b5686a0a 100644 --- a/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll +++ b/llvm/test/Transforms/InstCombine/zext-bool-add-sub.ll @@ -126,13 +126,13 @@ define <2 x i64> @zext_negate_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @zext_negate_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @zext_negate_vec_undef_elt( +define <2 x i64> @zext_negate_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @zext_negate_vec_poison_elt( ; CHECK-NEXT: [[EXT_NEG:%.*]] = sext <2 x i1> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[EXT_NEG]] ; %ext = zext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } @@ -169,13 +169,13 @@ define <2 x i64> @zext_sub_const_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @zext_sub_const_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @zext_sub_const_vec_undef_elt( -; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> +define <2 x i64> @zext_sub_const_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @zext_sub_const_vec_poison_elt( +; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %ext = zext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } @@ -212,13 +212,13 @@ define <2 x i64> @sext_negate_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @sext_negate_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @sext_negate_vec_undef_elt( +define <2 x i64> @sext_negate_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @sext_negate_vec_poison_elt( ; CHECK-NEXT: [[EXT_NEG:%.*]] = zext <2 x i1> [[A:%.*]] to <2 x i64> ; CHECK-NEXT: 
ret <2 x i64> [[EXT_NEG]] ; %ext = sext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } @@ -255,13 +255,13 @@ define <2 x i64> @sext_sub_const_vec(<2 x i1> %A) { ret <2 x i64> %sub } -define <2 x i64> @sext_sub_const_vec_undef_elt(<2 x i1> %A) { -; CHECK-LABEL: @sext_sub_const_vec_undef_elt( -; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> +define <2 x i64> @sext_sub_const_vec_poison_elt(<2 x i1> %A) { +; CHECK-LABEL: @sext_sub_const_vec_poison_elt( +; CHECK-NEXT: [[SUB:%.*]] = select <2 x i1> [[A:%.*]], <2 x i64> , <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[SUB]] ; %ext = sext <2 x i1> %A to <2 x i64> - %sub = sub <2 x i64> , %ext + %sub = sub <2 x i64> , %ext ret <2 x i64> %sub } diff --git a/llvm/test/Transforms/InstSimplify/AndOrXor.ll b/llvm/test/Transforms/InstSimplify/AndOrXor.ll index 494b6bcd2b66d..2e3a605224203 100644 --- a/llvm/test/Transforms/InstSimplify/AndOrXor.ll +++ b/llvm/test/Transforms/InstSimplify/AndOrXor.ll @@ -12,11 +12,11 @@ define i8 @and0(i8 %x) { ret i8 %r } -define <2 x i8> @and0_vec_undef_elt(<2 x i8> %x) { -; CHECK-LABEL: @and0_vec_undef_elt( +define <2 x i8> @and0_vec_poison_elt(<2 x i8> %x) { +; CHECK-LABEL: @and0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; - %r = and <2 x i8> %x, + %r = and <2 x i8> %x, ret <2 x i8> %r } @@ -31,14 +31,14 @@ define <2 x i32> @add_nsw_signbit(<2 x i32> %x) { ret <2 x i32> %z } -; Undef elements in either constant vector are ok. +; Poison elements in either constant vector are ok. -define <2 x i32> @add_nsw_signbit_undef(<2 x i32> %x) { -; CHECK-LABEL: @add_nsw_signbit_undef( +define <2 x i32> @add_nsw_signbit_poison(<2 x i32> %x) { +; CHECK-LABEL: @add_nsw_signbit_poison( ; CHECK-NEXT: ret <2 x i32> [[X:%.*]] ; - %y = xor <2 x i32> %x, - %z = add nsw <2 x i32> %y, + %y = xor <2 x i32> %x, + %z = add nsw <2 x i32> %y, ret <2 x i32> %z } @@ -53,14 +53,14 @@ define <2 x i5> @add_nuw_signbit(<2 x i5> %x) { ret <2 x i5> %z } -; Undef elements in either constant vector are ok. +; Poison elements in either constant vector are ok. -define <2 x i5> @add_nuw_signbit_undef(<2 x i5> %x) { -; CHECK-LABEL: @add_nuw_signbit_undef( +define <2 x i5> @add_nuw_signbit_poison(<2 x i5> %x) { +; CHECK-LABEL: @add_nuw_signbit_poison( ; CHECK-NEXT: ret <2 x i5> [[X:%.*]] ; - %y = xor <2 x i5> %x, - %z = add nuw <2 x i5> %y, + %y = xor <2 x i5> %x, + %z = add nuw <2 x i5> %y, ret <2 x i5> %z } @@ -584,7 +584,7 @@ define <2 x i32> @or_xor_andn_commute2(<2 x i32> %a, <2 x i32> %b) { ; CHECK-NEXT: ret <2 x i32> [[XOR]] ; %xor = xor <2 x i32> %a, %b - %neg = xor <2 x i32> %b, + %neg = xor <2 x i32> %b, %and = and <2 x i32> %a, %neg %or = or <2 x i32> %xor, %and ret <2 x i32> %or @@ -708,15 +708,13 @@ define <2 x i32> @or_xorn_and_commute2_undef(<2 x i32> %a, <2 x i32> %b) { ret <2 x i32> %or } -; TODO: Unlike the above test, this is safe to fold. +; Unlike the above test, this is safe to fold. 
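; (Illustrative reasoning, hedged: for defined lanes the bitwise identity
;   (B ^ ~A) | (B & A) == B ^ ~A
; holds, and a poison lane in the all-ones xor constant may legally be refined
; to -1, so the whole or/and pair collapses to the xor alone, as the CHECK
; lines below verify. The undef variant above cannot assume a specific value
; per lane, which is what blocked the fold there.)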
define <2 x i32> @or_xorn_and_commute2_poison(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @or_xorn_and_commute2_poison( ; CHECK-NEXT: [[NEGA:%.*]] = xor <2 x i32> [[A:%.*]], -; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[B:%.*]], [[A]] -; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[B]], [[NEGA]] -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[XOR]], [[AND]] -; CHECK-NEXT: ret <2 x i32> [[OR]] +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i32> [[B:%.*]], [[NEGA]] +; CHECK-NEXT: ret <2 x i32> [[XOR]] ; %nega = xor <2 x i32> %a, %and = and <2 x i32> %b, %a diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll index 52c207a276046..c6f6b65f89dc2 100644 --- a/llvm/test/Transforms/InstSimplify/call.ll +++ b/llvm/test/Transforms/InstSimplify/call.ll @@ -976,7 +976,7 @@ define <2 x i8> @fshr_zero_vec(<2 x i8> %shamt) { ; CHECK-LABEL: @fshr_zero_vec( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; - %r = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> zeroinitializer, <2 x i8> , <2 x i8> %shamt) + %r = call <2 x i8> @llvm.fshr.v2i8(<2 x i8> zeroinitializer, <2 x i8> , <2 x i8> %shamt) ret <2 x i8> %r } @@ -984,7 +984,7 @@ define <2 x i7> @fshl_ones_vec(<2 x i7> %shamt) { ; CHECK-LABEL: @fshl_ones_vec( ; CHECK-NEXT: ret <2 x i7> ; - %r = call <2 x i7> @llvm.fshl.v2i7(<2 x i7> , <2 x i7> , <2 x i7> %shamt) + %r = call <2 x i7> @llvm.fshl.v2i7(<2 x i7> , <2 x i7> , <2 x i7> %shamt) ret <2 x i7> %r } @@ -1466,7 +1466,7 @@ define <3 x i33> @cttz_shl1_vec(<3 x i33> %x) { ; CHECK-LABEL: @cttz_shl1_vec( ; CHECK-NEXT: ret <3 x i33> [[X:%.*]] ; - %s = shl <3 x i33> , %x + %s = shl <3 x i33> , %x %r = call <3 x i33> @llvm.cttz.v3i33(<3 x i33> %s, i1 false) ret <3 x i33> %r } @@ -1509,7 +1509,7 @@ define <3 x i33> @ctlz_lshr_sign_bit_vec(<3 x i33> %x) { ; CHECK-LABEL: @ctlz_lshr_sign_bit_vec( ; CHECK-NEXT: ret <3 x i33> [[X:%.*]] ; - %s = lshr <3 x i33> , %x + %s = lshr <3 x i33> , %x %r = call <3 x i33> @llvm.ctlz.v3i33(<3 x i33> %s, i1 false) ret <3 x i33> %r } @@ -1549,7 +1549,7 @@ define <3 x i33> @ctlz_ashr_sign_bit_vec(<3 x i33> %x) { ; CHECK-LABEL: @ctlz_ashr_sign_bit_vec( ; CHECK-NEXT: ret <3 x i33> zeroinitializer ; - %s = ashr <3 x i33> , %x + %s = ashr <3 x i33> , %x %r = call <3 x i33> @llvm.ctlz.v3i33(<3 x i33> %s, i1 true) ret <3 x i33> %r } diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 1e90f0edbd800..724912d90bd86 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -1659,21 +1659,21 @@ define <2 x i1> @icmp_shl_1_ugt_signmask(<2 x i8> %V) { ret <2 x i1> %cmp } -define <2 x i1> @icmp_shl_1_ugt_signmask_undef(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ugt_signmask_undef( +define <2 x i1> @icmp_shl_1_ugt_signmask_poison(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ugt_signmask_poison( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %shl = shl <2 x i8> , %V - %cmp = icmp ugt <2 x i8> %shl, + %cmp = icmp ugt <2 x i8> %shl, ret <2 x i1> %cmp } -define <2 x i1> @icmp_shl_1_ugt_signmask_undef2(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ugt_signmask_undef2( +define <2 x i1> @icmp_shl_1_ugt_signmask_poison2(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ugt_signmask_poison2( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %shl = shl <2 x i8> , %V - %cmp = icmp ugt <2 x i8> %shl, + %shl = shl <2 x i8> , %V + %cmp = icmp ugt <2 x i8> %shl, ret <2 x i1> %cmp } @@ -1695,21 +1695,21 @@ define <2 x i1> @icmp_shl_1_ule_signmask(<2 x i8> %V) { ret <2 x i1> %cmp } -define <2 x i1> 
@icmp_shl_1_ule_signmask_undef(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ule_signmask_undef( +define <2 x i1> @icmp_shl_1_ule_signmask_poison(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ule_signmask_poison( ; CHECK-NEXT: ret <2 x i1> ; %shl = shl <2 x i8> , %V - %cmp = icmp ule <2 x i8> %shl, + %cmp = icmp ule <2 x i8> %shl, ret <2 x i1> %cmp } -define <2 x i1> @icmp_shl_1_ule_signmask_undef2(<2 x i8> %V) { -; CHECK-LABEL: @icmp_shl_1_ule_signmask_undef2( +define <2 x i1> @icmp_shl_1_ule_signmask_poison2(<2 x i8> %V) { +; CHECK-LABEL: @icmp_shl_1_ule_signmask_poison2( ; CHECK-NEXT: ret <2 x i1> ; - %shl = shl <2 x i8> , %V - %cmp = icmp ule <2 x i8> %shl, + %shl = shl <2 x i8> , %V + %cmp = icmp ule <2 x i8> %shl, ret <2 x i1> %cmp } @@ -1731,12 +1731,12 @@ define <2 x i1> @shl_1_cmp_eq_nonpow2_splat(<2 x i32> %x) { ret <2 x i1> %c } -define <2 x i1> @shl_1_cmp_eq_nonpow2_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_1_cmp_eq_nonpow2_splat_undef( +define <2 x i1> @shl_1_cmp_eq_nonpow2_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_1_cmp_eq_nonpow2_splat_poison( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %s = shl <2 x i32> , %x - %c = icmp eq <2 x i32> %s, + %c = icmp eq <2 x i32> %s, ret <2 x i1> %c } @@ -1758,12 +1758,12 @@ define <2 x i1> @shl_1_cmp_ne_nonpow2_splat(<2 x i32> %x) { ret <2 x i1> %c } -define <2 x i1> @shl_1_cmp_ne_nonpow2_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_1_cmp_ne_nonpow2_splat_undef( +define <2 x i1> @shl_1_cmp_ne_nonpow2_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_1_cmp_ne_nonpow2_splat_poison( ; CHECK-NEXT: ret <2 x i1> ; - %s = shl <2 x i32> , %x - %c = icmp ne <2 x i32> %s, + %s = shl <2 x i32> , %x + %c = icmp ne <2 x i32> %s, ret <2 x i1> %c } @@ -1776,12 +1776,12 @@ define i1 @shl_pow2_cmp_eq_nonpow2(i32 %x) { ret i1 %c } -define <2 x i1> @shl_pow21_cmp_ne_nonpow2_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_pow21_cmp_ne_nonpow2_splat_undef( +define <2 x i1> @shl_pow21_cmp_ne_nonpow2_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_pow21_cmp_ne_nonpow2_splat_poison( ; CHECK-NEXT: ret <2 x i1> ; - %s = shl <2 x i32> , %x - %c = icmp ne <2 x i32> %s, + %s = shl <2 x i32> , %x + %c = icmp ne <2 x i32> %s, ret <2 x i1> %c } @@ -1820,12 +1820,12 @@ define i1 @shl_pow2_cmp_eq_zero_nuw(i32 %x) { ret i1 %c } -define <2 x i1> @shl_pow2_cmp_ne_zero_nuw_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_pow2_cmp_ne_zero_nuw_splat_undef( +define <2 x i1> @shl_pow2_cmp_ne_zero_nuw_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_pow2_cmp_ne_zero_nuw_splat_poison( ; CHECK-NEXT: ret <2 x i1> ; - %s = shl nuw <2 x i32> , %x - %c = icmp ne <2 x i32> %s, + %s = shl nuw <2 x i32> , %x + %c = icmp ne <2 x i32> %s, ret <2 x i1> %c } @@ -1838,12 +1838,12 @@ define i1 @shl_pow2_cmp_ne_zero_nsw(i32 %x) { ret i1 %c } -define <2 x i1> @shl_pow2_cmp_eq_zero_nsw_splat_undef(<2 x i32> %x) { -; CHECK-LABEL: @shl_pow2_cmp_eq_zero_nsw_splat_undef( +define <2 x i1> @shl_pow2_cmp_eq_zero_nsw_splat_poison(<2 x i32> %x) { +; CHECK-LABEL: @shl_pow2_cmp_eq_zero_nsw_splat_poison( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %s = shl nsw <2 x i32> , %x - %c = icmp eq <2 x i32> %s, + %s = shl nsw <2 x i32> , %x + %c = icmp eq <2 x i32> %s, ret <2 x i1> %c } diff --git a/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll b/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll index 7c9d9a9e2c7ce..92d6cc30d6248 100644 --- a/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll +++ 
b/llvm/test/Transforms/InstSimplify/constantfold-add-nuw-allones-to-allones.ll @@ -63,11 +63,11 @@ define <2 x i8> @add_vec(<2 x i8> %x) { ret <2 x i8> %ret } -define <3 x i8> @add_vec_undef(<3 x i8> %x) { -; CHECK-LABEL: @add_vec_undef( -; CHECK-NEXT: ret <3 x i8> +define <3 x i8> @add_vec_poison(<3 x i8> %x) { +; CHECK-LABEL: @add_vec_poison( +; CHECK-NEXT: ret <3 x i8> ; - %ret = add nuw <3 x i8> %x, + %ret = add nuw <3 x i8> %x, ret <3 x i8> %ret } diff --git a/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll b/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll index b5b5773fee538..3f4a08807a4b4 100644 --- a/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll +++ b/llvm/test/Transforms/InstSimplify/constantfold-shl-nuw-C-to-C.ll @@ -78,11 +78,11 @@ define <2 x i8> @shl_vec(<2 x i8> %x) { ret <2 x i8> %ret } -define <3 x i8> @shl_vec_undef(<3 x i8> %x) { -; CHECK-LABEL: @shl_vec_undef( -; CHECK-NEXT: ret <3 x i8> +define <3 x i8> @shl_vec_poison(<3 x i8> %x) { +; CHECK-LABEL: @shl_vec_poison( +; CHECK-NEXT: ret <3 x i8> ; - %ret = shl nuw <3 x i8> , %x + %ret = shl nuw <3 x i8> , %x ret <3 x i8> %ret } diff --git a/llvm/test/Transforms/InstSimplify/div.ll b/llvm/test/Transforms/InstSimplify/div.ll index e13b6f139bcf5..5ca2e8837b924 100644 --- a/llvm/test/Transforms/InstSimplify/div.ll +++ b/llvm/test/Transforms/InstSimplify/div.ll @@ -17,11 +17,11 @@ define <2 x i32> @zero_dividend_vector(<2 x i32> %A) { ret <2 x i32> %B } -define <2 x i32> @zero_dividend_vector_undef_elt(<2 x i32> %A) { -; CHECK-LABEL: @zero_dividend_vector_undef_elt( +define <2 x i32> @zero_dividend_vector_poison_elt(<2 x i32> %A) { +; CHECK-LABEL: @zero_dividend_vector_poison_elt( ; CHECK-NEXT: ret <2 x i32> zeroinitializer ; - %B = sdiv <2 x i32> , %A + %B = sdiv <2 x i32> , %A ret <2 x i32> %B } @@ -59,23 +59,23 @@ define <2 x i8> @udiv_zero_elt_vec(<2 x i8> %x) { ret <2 x i8> %div } -define <2 x i8> @sdiv_undef_elt_vec(<2 x i8> %x) { -; CHECK-LABEL: @sdiv_undef_elt_vec( +define <2 x i8> @sdiv_poison_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @sdiv_poison_elt_vec( ; CHECK-NEXT: ret <2 x i8> poison ; - %div = sdiv <2 x i8> %x, + %div = sdiv <2 x i8> %x, ret <2 x i8> %div } -define <2 x i8> @udiv_undef_elt_vec(<2 x i8> %x) { -; CHECK-LABEL: @udiv_undef_elt_vec( +define <2 x i8> @udiv_poison_elt_vec(<2 x i8> %x) { +; CHECK-LABEL: @udiv_poison_elt_vec( ; CHECK-NEXT: ret <2 x i8> poison ; - %div = udiv <2 x i8> %x, + %div = udiv <2 x i8> %x, ret <2 x i8> %div } -; Division-by-zero is undef. UB in any vector lane means the whole op is undef. +; Division-by-zero is poison. UB in any vector lane means the whole op is poison. ; Thus, we can simplify this: if any element of 'y' is 0, we can do anything. ; Therefore, assume that all elements of 'y' must be 1. 
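; (Illustrative sketch of the comment above; the value names are assumed: for
;   %div = udiv <2 x i8> %x, %y
; any zero lane in %y would be immediate UB, so a simplifier may assume every
; lane of %y is 1, and udiv by a splat of 1 folds %div to %x.)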
diff --git a/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll b/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll index 4938987baccc2..b1d772890aff8 100644 --- a/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll +++ b/llvm/test/Transforms/InstSimplify/fast-math-strictfp.ll @@ -18,11 +18,11 @@ define float @mul_zero_2(float %a) #0 { ret float %b } -define <2 x float> @mul_zero_nsz_nnan_vec_undef(<2 x float> %a) #0 { -; CHECK-LABEL: @mul_zero_nsz_nnan_vec_undef( +define <2 x float> @mul_zero_nsz_nnan_vec_poison(<2 x float> %a) #0 { +; CHECK-LABEL: @mul_zero_nsz_nnan_vec_poison( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; - %b = call nsz nnan <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %a, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %b = call nsz nnan <2 x float> @llvm.experimental.constrained.fmul.v2f32(<2 x float> %a, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %b } @@ -98,13 +98,13 @@ define <2 x float> @fadd_unary_fnegx_commute_vec(<2 x float> %x) #0 { ret <2 x float> %r } -define <2 x float> @fadd_fnegx_commute_vec_undef(<2 x float> %x) #0 { -; CHECK-LABEL: @fadd_fnegx_commute_vec_undef( -; CHECK-NEXT: [[NEGX:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fadd_fnegx_commute_vec_poison(<2 x float> %x) #0 { +; CHECK-LABEL: @fadd_fnegx_commute_vec_poison( +; CHECK-NEXT: [[NEGX:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[R:%.*]] = call nnan <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[X]], <2 x float> [[NEGX]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[R]] ; - %negx = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %negx = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") %r = call nnan <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> %negx, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -240,34 +240,34 @@ define float @fneg_x(float %a) #0 { ret float %ret } -define <2 x float> @fsub_0_0_x_vec_undef1(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_0_0_x_vec_undef1( -; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_0_0_x_vec_poison1(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_0_0_x_vec_poison1( +; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[RET:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x 
float> , <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef1(<2 x float> %a) #0 { -; CHECK-LABEL: @fneg_x_vec_undef1( +define <2 x float> @fneg_x_vec_poison1(<2 x float> %a) #0 { +; CHECK-LABEL: @fneg_x_vec_poison1( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } -define <2 x float> @fsub_0_0_x_vec_undef2(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_0_0_x_vec_undef2( +define <2 x float> @fsub_0_0_x_vec_poison2(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_0_0_x_vec_poison2( ; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") -; CHECK-NEXT: [[RET:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +; CHECK-NEXT: [[RET:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> zeroinitializer, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") - %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } @@ -281,11 +281,11 @@ define <2 x float> @fadd_zero_nsz_vec(<2 x float> %x) #0 { ret <2 x float> %r } -define <2 x float> @fadd_zero_nsz_vec_undef(<2 x float> %x) #0 { -; CHECK-LABEL: @fadd_zero_nsz_vec_undef( +define <2 x float> @fadd_zero_nsz_vec_poison(<2 x float> %x) #0 { +; CHECK-LABEL: @fadd_zero_nsz_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = call nsz <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nsz <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %x, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -375,11 +375,11 @@ define double @fdiv_zero_by_x(double %x) #0 { ret double %r } -define <2 x double> @fdiv_zero_by_x_vec_undef(<2 x double> %x) #0 { -; CHECK-LABEL: @fdiv_zero_by_x_vec_undef( +define <2 x double> @fdiv_zero_by_x_vec_poison(<2 x double> %x) #0 { +; CHECK-LABEL: @fdiv_zero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = call nnan nsz <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nnan nsz <2 x double> @llvm.experimental.constrained.fdiv.v2f64(<2 x double> , <2 x double> %x, metadata 
!"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %r } @@ -394,11 +394,11 @@ define double @frem_zero_by_x(double %x) #0 { ret double %r } -define <2 x double> @frem_poszero_by_x_vec_undef(<2 x double> %x) #0 { -; CHECK-LABEL: @frem_poszero_by_x_vec_undef( +define <2 x double> @frem_poszero_by_x_vec_poison(<2 x double> %x) #0 { +; CHECK-LABEL: @frem_poszero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %r } @@ -413,11 +413,11 @@ define double @frem_negzero_by_x(double %x) #0 { ret double %r } -define <2 x double> @frem_negzero_by_x_vec_undef(<2 x double> %x) #0 { -; CHECK-LABEL: @frem_negzero_by_x_vec_undef( +define <2 x double> @frem_negzero_by_x_vec_poison(<2 x double> %x) #0 { +; CHECK-LABEL: @frem_negzero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> ; - %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nnan <2 x double> @llvm.experimental.constrained.frem.v2f64(<2 x double> , <2 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x double> %r } @@ -493,13 +493,13 @@ define float @fdiv_neg_swapped2(float %f) #0 { ret float %div } -define <2 x float> @fdiv_neg_vec_undef_elt(<2 x float> %f) #0 { -; CHECK-LABEL: @fdiv_neg_vec_undef_elt( -; CHECK-NEXT: [[NEG:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[F:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fdiv_neg_vec_poison_elt(<2 x float> %f) #0 { +; CHECK-LABEL: @fdiv_neg_vec_poison_elt( +; CHECK-NEXT: [[NEG:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[F:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[DIV:%.*]] = call nnan <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> [[F]], <2 x float> [[NEG]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[DIV]] ; - %neg = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %f, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %neg = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> %f, metadata !"round.tonearest", metadata !"fpexcept.ignore") %div = call nnan <2 x float> @llvm.experimental.constrained.fdiv.v2f32(<2 x float> %f, <2 x float> %neg, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %div } diff --git a/llvm/test/Transforms/InstSimplify/fast-math.ll b/llvm/test/Transforms/InstSimplify/fast-math.ll index d1818e6346d7a..287f30b162f80 100644 --- a/llvm/test/Transforms/InstSimplify/fast-math.ll +++ b/llvm/test/Transforms/InstSimplify/fast-math.ll @@ -18,11 +18,11 @@ define float @mul_zero_2(float %a) { ret float %b } -define <2 x float> @mul_zero_nsz_nnan_vec_undef(<2 x float> %a) { -; CHECK-LABEL: @mul_zero_nsz_nnan_vec_undef( +define <2 x float> @mul_zero_nsz_nnan_vec_poison(<2 x float> %a) { +; CHECK-LABEL: @mul_zero_nsz_nnan_vec_poison( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; - %b = fmul nsz nnan <2 x float> %a, + %b = fmul nsz nnan <2 x 
float> %a, ret <2 x float> %b } @@ -94,11 +94,11 @@ define <2 x float> @fadd_unary_fnegx_commute_vec(<2 x float> %x) { ret <2 x float> %r } -define <2 x float> @fadd_fnegx_commute_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fadd_fnegx_commute_vec_undef( +define <2 x float> @fadd_fnegx_commute_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fadd_fnegx_commute_vec_poison( ; CHECK-NEXT: ret <2 x float> zeroinitializer ; - %negx = fsub <2 x float> , %x + %negx = fsub <2 x float> , %x %r = fadd nnan <2 x float> %x, %negx ret <2 x float> %r } @@ -226,30 +226,30 @@ define float @fneg_x(float %a) { ret float %ret } -define <2 x float> @fsub_0_0_x_vec_undef1(<2 x float> %a) { -; CHECK-LABEL: @fsub_0_0_x_vec_undef1( +define <2 x float> @fsub_0_0_x_vec_poison1(<2 x float> %a) { +; CHECK-LABEL: @fsub_0_0_x_vec_poison1( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %t1 = fsub <2 x float> , %a + %t1 = fsub <2 x float> , %a %ret = fsub nsz <2 x float> zeroinitializer, %t1 ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef1(<2 x float> %a) { -; CHECK-LABEL: @fneg_x_vec_undef1( +define <2 x float> @fneg_x_vec_poison1(<2 x float> %a) { +; CHECK-LABEL: @fneg_x_vec_poison1( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = fsub nsz <2 x float> , %t1 + %ret = fsub nsz <2 x float> , %t1 ret <2 x float> %ret } -define <2 x float> @fsub_0_0_x_vec_undef2(<2 x float> %a) { -; CHECK-LABEL: @fsub_0_0_x_vec_undef2( +define <2 x float> @fsub_0_0_x_vec_poison2(<2 x float> %a) { +; CHECK-LABEL: @fsub_0_0_x_vec_poison2( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fsub <2 x float> zeroinitializer, %a - %ret = fsub nsz <2 x float> , %t1 + %ret = fsub nsz <2 x float> , %t1 ret <2 x float> %ret } @@ -263,11 +263,11 @@ define <2 x float> @fadd_zero_nsz_vec(<2 x float> %x) { ret <2 x float> %r } -define <2 x float> @fadd_zero_nsz_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fadd_zero_nsz_vec_undef( +define <2 x float> @fadd_zero_nsz_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fadd_zero_nsz_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = fadd nsz <2 x float> %x, + %r = fadd nsz <2 x float> %x, ret <2 x float> %r } @@ -357,11 +357,11 @@ define double @fdiv_zero_by_x(double %x) { ret double %r } -define <2 x double> @fdiv_zero_by_x_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @fdiv_zero_by_x_vec_undef( +define <2 x double> @fdiv_zero_by_x_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @fdiv_zero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = fdiv nnan nsz <2 x double> , %x + %r = fdiv nnan nsz <2 x double> , %x ret <2 x double> %r } @@ -376,11 +376,11 @@ define double @frem_zero_by_x(double %x) { ret double %r } -define <2 x double> @frem_poszero_by_x_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @frem_poszero_by_x_vec_undef( +define <2 x double> @frem_poszero_by_x_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @frem_poszero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> zeroinitializer ; - %r = frem nnan <2 x double> , %x + %r = frem nnan <2 x double> , %x ret <2 x double> %r } @@ -395,11 +395,11 @@ define double @frem_negzero_by_x(double %x) { ret double %r } -define <2 x double> @frem_negzero_by_x_vec_undef(<2 x double> %x) { -; CHECK-LABEL: @frem_negzero_by_x_vec_undef( +define <2 x double> @frem_negzero_by_x_vec_poison(<2 x double> %x) { +; CHECK-LABEL: @frem_negzero_by_x_vec_poison( ; CHECK-NEXT: ret <2 x double> ; - %r = frem nnan <2 x double> , %x + %r = frem nnan <2 x double> , %x ret <2 x double> %r } @@ -467,11 +467,11 @@ define float 
@fdiv_neg_swapped2(float %f) { ret float %div } -define <2 x float> @fdiv_neg_vec_undef_elt(<2 x float> %f) { -; CHECK-LABEL: @fdiv_neg_vec_undef_elt( +define <2 x float> @fdiv_neg_vec_poison_elt(<2 x float> %f) { +; CHECK-LABEL: @fdiv_neg_vec_poison_elt( ; CHECK-NEXT: ret <2 x float> ; - %neg = fsub <2 x float> , %f + %neg = fsub <2 x float> , %f %div = fdiv nnan <2 x float> %f, %neg ret <2 x float> %div } diff --git a/llvm/test/Transforms/InstSimplify/fdiv.ll b/llvm/test/Transforms/InstSimplify/fdiv.ll index 38e31257e185a..fb59011b91d5b 100644 --- a/llvm/test/Transforms/InstSimplify/fdiv.ll +++ b/llvm/test/Transforms/InstSimplify/fdiv.ll @@ -110,11 +110,11 @@ define <2 x float> @fdiv_nnan_ninf_by_undef_v2f32(<2 x float> %x) { ret <2 x float> %fdiv } -define <2 x float> @fdiv_nnan_ninf_by_zero_undef_v2f32(<2 x float> %x) { -; CHECK-LABEL: @fdiv_nnan_ninf_by_zero_undef_v2f32( +define <2 x float> @fdiv_nnan_ninf_by_zero_poison_v2f32(<2 x float> %x) { +; CHECK-LABEL: @fdiv_nnan_ninf_by_zero_poison_v2f32( ; CHECK-NEXT: ret <2 x float> poison ; - %fdiv = fdiv nnan ninf <2 x float> %x, + %fdiv = fdiv nnan ninf <2 x float> %x, ret <2 x float> %fdiv } diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll index e4748a2402923..32ea4cb7cd198 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic-strictfp.ll @@ -24,23 +24,23 @@ define <2 x float> @fsub_-0_x_vec(<2 x float> %a) #0 { ret <2 x float> %ret } -define <2 x float> @fsub_-0_x_vec_undef_elts(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_-0_x_vec_undef_elts( -; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_-0_x_vec_poison_elts(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_-0_x_vec_poison_elts( +; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: [[RET:%.*]] = fneg <2 x float> [[T1]] ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") %ret = fneg <2 x float> %t1 ret <2 x float> %ret } -define <2 x float> @fsub_negzero_vec_undef_elts(<2 x float> %x) #0 { -; CHECK-LABEL: @fsub_negzero_vec_undef_elts( -; CHECK-NEXT: [[R:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_negzero_vec_poison_elts(<2 x float> %x) #0 { +; CHECK-LABEL: @fsub_negzero_vec_poison_elts( +; CHECK-NEXT: [[R:%.*]] = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[X:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[R]] ; - %r = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %x, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call nsz <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %x, 
metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -86,23 +86,23 @@ define <2 x float> @fneg_x_vec(<2 x float> %a) #0 { ret <2 x float> %ret } -define <2 x float> @fsub_-0_-0_x_vec_undef_elts(<2 x float> %a) #0 { -; CHECK-LABEL: @fsub_-0_-0_x_vec_undef_elts( -; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") -; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fsub_-0_-0_x_vec_poison_elts(<2 x float> %a) #0 { +; CHECK-LABEL: @fsub_-0_-0_x_vec_poison_elts( +; CHECK-NEXT: [[T1:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[A:%.*]], metadata !"round.tonearest", metadata !"fpexcept.ignore") +; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> , <2 x float> [[T1]], metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") - %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %t1 = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %a, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef_elts(<2 x float> %a) #0 { -; CHECK-LABEL: @fneg_x_vec_undef_elts( +define <2 x float> @fneg_x_vec_poison_elts(<2 x float> %a) #0 { +; CHECK-LABEL: @fneg_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float>, <2 x float> %t1, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } @@ -139,11 +139,11 @@ define float @fsub_x_0(float %x) #0 { ret float %r } -define <2 x float> @fsub_x_0_vec_undef(<2 x float> %x) #0 { -; CHECK-LABEL: @fsub_x_0_vec_undef( +define <2 x float> @fsub_x_0_vec_poison(<2 x float> %x) #0 { +; CHECK-LABEL: @fsub_x_0_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") + %r = call <2 x float> @llvm.experimental.constrained.fsub.v2f32(<2 x float> %x, <2 x float>, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %r } @@ -156,11 +156,11 @@ define float @fadd_x_n0(float %a) #0 { ret float %ret } -define <2 x float> @fadd_x_n0_vec_undef_elt(<2 x float> %a) #0 { -; CHECK-LABEL: @fadd_x_n0_vec_undef_elt( +define <2 x float> @fadd_x_n0_vec_poison_elt(<2 x float> %a) #0 { +; CHECK-LABEL: @fadd_x_n0_vec_poison_elt( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata 
!"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } @@ -174,12 +174,12 @@ define float @fadd_x_p0(float %a) #0 { ret float %ret } -define <2 x float> @fadd_x_p0_vec_undef_elt(<2 x float> %a) #0 { -; CHECK-LABEL: @fadd_x_p0_vec_undef_elt( -; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[A:%.*]], <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") +define <2 x float> @fadd_x_p0_vec_poison_elt(<2 x float> %a) #0 { +; CHECK-LABEL: @fadd_x_p0_vec_poison_elt( +; CHECK-NEXT: [[RET:%.*]] = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> [[A:%.*]], <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ; CHECK-NEXT: ret <2 x float> [[RET]] ; - %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") + %ret = call <2 x float> @llvm.experimental.constrained.fadd.v2f32(<2 x float> %a, <2 x float> , metadata !"round.tonearest", metadata !"fpexcept.ignore") ret <2 x float> %ret } diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll index 5d17504c09df6..7a35f09f03b99 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll @@ -30,20 +30,20 @@ define <2 x float> @fsub_-0_x_vec(<2 x float> %a) { ret <2 x float> %ret } -define <2 x float> @fsub_-0_x_vec_undef_elts(<2 x float> %a) { -; CHECK-LABEL: @fsub_-0_x_vec_undef_elts( +define <2 x float> @fsub_-0_x_vec_poison_elts(<2 x float> %a) { +; CHECK-LABEL: @fsub_-0_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %t1 = fsub <2 x float> , %a + %t1 = fsub <2 x float> , %a %ret = fneg <2 x float> %t1 ret <2 x float> %ret } -define <2 x float> @fsub_negzero_vec_undef_elts(<2 x float> %x) { -; CHECK-LABEL: @fsub_negzero_vec_undef_elts( +define <2 x float> @fsub_negzero_vec_poison_elts(<2 x float> %x) { +; CHECK-LABEL: @fsub_negzero_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r = fsub nsz <2 x float> %x, + %r = fsub nsz <2 x float> %x, ret <2 x float> %r } @@ -85,21 +85,21 @@ define <2 x float> @fneg_x_vec(<2 x float> %a) { ret <2 x float> %ret } -define <2 x float> @fsub_-0_-0_x_vec_undef_elts(<2 x float> %a) { -; CHECK-LABEL: @fsub_-0_-0_x_vec_undef_elts( +define <2 x float> @fsub_-0_-0_x_vec_poison_elts(<2 x float> %a) { +; CHECK-LABEL: @fsub_-0_-0_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %t1 = fsub <2 x float> , %a - %ret = fsub <2 x float> , %t1 + %t1 = fsub <2 x float> , %a + %ret = fsub <2 x float> , %t1 ret <2 x float> %ret } -define <2 x float> @fneg_x_vec_undef_elts(<2 x float> %a) { -; CHECK-LABEL: @fneg_x_vec_undef_elts( +define <2 x float> @fneg_x_vec_poison_elts(<2 x float> %a) { +; CHECK-LABEL: @fneg_x_vec_poison_elts( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; %t1 = fneg <2 x float> %a - %ret = fsub <2 x float> , %t1 + %ret = fsub <2 x float> , %t1 ret <2 x float> %ret } @@ -136,11 +136,11 @@ define float @fsub_x_0(float %x) { ret float %r } -define <2 x float> @fsub_x_0_vec_undef(<2 x float> %x) { -; CHECK-LABEL: @fsub_x_0_vec_undef( +define <2 x float> @fsub_x_0_vec_poison(<2 x float> %x) { +; CHECK-LABEL: @fsub_x_0_vec_poison( ; CHECK-NEXT: ret <2 x float> [[X:%.*]] ; - %r 
= fsub <2 x float> %x, + %r = fsub <2 x float> %x, ret <2 x float> %r } @@ -153,11 +153,11 @@ define float @fadd_x_n0(float %a) { ret float %ret } -define <2 x float> @fadd_x_n0_vec_undef_elt(<2 x float> %a) { -; CHECK-LABEL: @fadd_x_n0_vec_undef_elt( +define <2 x float> @fadd_x_n0_vec_poison_elt(<2 x float> %a) { +; CHECK-LABEL: @fadd_x_n0_vec_poison_elt( ; CHECK-NEXT: ret <2 x float> [[A:%.*]] ; - %ret = fadd <2 x float> %a, + %ret = fadd <2 x float> %a, ret <2 x float> %ret } diff --git a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll index 3c1794c81284d..70f0321039ea9 100644 --- a/llvm/test/Transforms/InstSimplify/floating-point-compare.ll +++ b/llvm/test/Transforms/InstSimplify/floating-point-compare.ll @@ -547,30 +547,30 @@ define <2 x i1> @fabs_is_not_negative_anyzero(<2 x float> %V) { ret <2 x i1> %cmp } -define <3 x i1> @fabs_is_not_negative_negzero_undef(<3 x float> %V) { -; CHECK-LABEL: @fabs_is_not_negative_negzero_undef( +define <3 x i1> @fabs_is_not_negative_negzero_poison(<3 x float> %V) { +; CHECK-LABEL: @fabs_is_not_negative_negzero_poison( ; CHECK-NEXT: ret <3 x i1> zeroinitializer ; %abs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %V) - %cmp = fcmp olt <3 x float> %abs, + %cmp = fcmp olt <3 x float> %abs, ret <3 x i1> %cmp } -define <3 x i1> @fabs_is_not_negative_poszero_undef(<3 x float> %V) { -; CHECK-LABEL: @fabs_is_not_negative_poszero_undef( +define <3 x i1> @fabs_is_not_negative_poszero_poison(<3 x float> %V) { +; CHECK-LABEL: @fabs_is_not_negative_poszero_poison( ; CHECK-NEXT: ret <3 x i1> zeroinitializer ; %abs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %V) - %cmp = fcmp olt <3 x float> %abs, + %cmp = fcmp olt <3 x float> %abs, ret <3 x i1> %cmp } -define <3 x i1> @fabs_is_not_negative_anyzero_undef(<3 x float> %V) { -; CHECK-LABEL: @fabs_is_not_negative_anyzero_undef( +define <3 x i1> @fabs_is_not_negative_anyzero_poison(<3 x float> %V) { +; CHECK-LABEL: @fabs_is_not_negative_anyzero_poison( ; CHECK-NEXT: ret <3 x i1> zeroinitializer ; %abs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %V) - %cmp = fcmp olt <3 x float> %abs, + %cmp = fcmp olt <3 x float> %abs, ret <3 x i1> %cmp } @@ -1335,19 +1335,19 @@ define <2 x i1> @orderedCompareWithNaNVector(<2 x double> %A) { ret <2 x i1> %cmp } -define <2 x i1> @orderedCompareWithNaNVector_undef_elt(<2 x double> %A) { -; CHECK-LABEL: @orderedCompareWithNaNVector_undef_elt( +define <2 x i1> @orderedCompareWithNaNVector_poison_elt(<2 x double> %A) { +; CHECK-LABEL: @orderedCompareWithNaNVector_poison_elt( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %cmp = fcmp olt <2 x double> %A, + %cmp = fcmp olt <2 x double> %A, ret <2 x i1> %cmp } -define <2 x i1> @unorderedCompareWithNaNVector_undef_elt(<2 x double> %A) { -; CHECK-LABEL: @unorderedCompareWithNaNVector_undef_elt( +define <2 x i1> @unorderedCompareWithNaNVector_poison_elt(<2 x double> %A) { +; CHECK-LABEL: @unorderedCompareWithNaNVector_poison_elt( ; CHECK-NEXT: ret <2 x i1> ; - %cmp = fcmp ult <2 x double> %A, + %cmp = fcmp ult <2 x double> %A, ret <2 x i1> %cmp } diff --git a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll index a8a9e96a652fa..668a93ddf5a42 100644 --- a/llvm/test/Transforms/InstSimplify/fminmax-folds.ll +++ b/llvm/test/Transforms/InstSimplify/fminmax-folds.ll @@ -493,7 +493,7 @@ define <2 x double> @maxnum_nan_op0_vec(<2 x double> %x) { ; CHECK-LABEL: @maxnum_nan_op0_vec( ; CHECK-NEXT: ret <2 x double> 
[[X:%.*]] ; - %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } @@ -509,7 +509,7 @@ define <2 x double> @minnum_nan_op0_vec(<2 x double> %x) { ; CHECK-LABEL: @minnum_nan_op0_vec( ; CHECK-NEXT: ret <2 x double> [[X:%.*]] ; - %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } @@ -873,19 +873,19 @@ define double @minimum_nan_op1(double %x) { ret double %r } -define <2 x double> @maximum_nan_op0_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op0_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @maximum_nan_op0_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @maximum_nan_op0_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } -define <2 x double> @maximum_nan_op1_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @maximum_nan_op1_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @maximum_nan_op1_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @maximum_nan_op1_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) + %r = call <2 x double> @llvm.maximum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r } @@ -897,19 +897,19 @@ define <2 x double> @maximum_nan_op1_vec(<2 x double> %x) { ret <2 x double> %r } -define <2 x double> @minimum_nan_op0_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op0_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @minimum_nan_op0_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @minimum_nan_op0_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> , <2 x double> %x) ret <2 x double> %r } -define <2 x double> @minimum_nan_op1_vec_partial_undef(<2 x double> %x) { -; CHECK-LABEL: @minimum_nan_op1_vec_partial_undef( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @minimum_nan_op1_vec_partial_poison(<2 x double> %x) { +; CHECK-LABEL: @minimum_nan_op1_vec_partial_poison( +; CHECK-NEXT: ret <2 x double> ; - %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) + %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> ) ret <2 x double> %r } diff --git a/llvm/test/Transforms/InstSimplify/fp-nan.ll b/llvm/test/Transforms/InstSimplify/fp-nan.ll index cb0bed3790782..bb557500822c1 100644 --- a/llvm/test/Transforms/InstSimplify/fp-nan.ll +++ b/llvm/test/Transforms/InstSimplify/fp-nan.ll @@ -163,13 +163,13 @@ define <2 x double> @fsub_nan_poison_op1(<2 x double> %x) { ret <2 x double> %r } -; Vector with undef element +; Vector with poison element -define <2 x double> @frem_nan_undef_op0(<2 x double> %x) { -; CHECK-LABEL: @frem_nan_undef_op0( -; CHECK-NEXT: ret <2 x double> +define <2 x double> @frem_nan_poison_op0(<2 x double> %x) { +; CHECK-LABEL: @frem_nan_poison_op0( +; CHECK-NEXT: ret <2 x double> ; - %r = frem <2 x double> , %x + %r = frem <2 x double> , %x ret <2 x double> %r } @@ -177,7 +177,8 @@ define <2 x double> @frem_nan_undef_op0(<2 x double> %x) { define <3 x double> 
@fadd_nan_poison_undef_op1(<3 x double> %x) { ; CHECK-LABEL: @fadd_nan_poison_undef_op1( -; CHECK-NEXT: ret <3 x double> +; CHECK-NEXT: [[R:%.*]] = fadd <3 x double> [[X:%.*]], +; CHECK-NEXT: ret <3 x double> [[R]] ; %r = fadd <3 x double> %x, ret <3 x double> %r diff --git a/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll index 6205225098a7a..a501f995b6c97 100644 --- a/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-bool-constant.ll @@ -12,11 +12,11 @@ define <2 x i1> @eq_t(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @eq_t_undef_elt(<2 x i1> %a) { -; CHECK-LABEL: @eq_t_undef_elt( +define <2 x i1> @eq_t_poison_elt(<2 x i1> %a) { +; CHECK-LABEL: @eq_t_poison_elt( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %r = icmp eq <2 x i1> %a, + %r = icmp eq <2 x i1> %a, ret <2 x i1> %r } @@ -54,11 +54,11 @@ define <2 x i1> @ugt_t(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ugt_t_undef_elt(<2 x i1> %a) { -; CHECK-LABEL: @ugt_t_undef_elt( +define <2 x i1> @ugt_t_poison_elt(<2 x i1> %a) { +; CHECK-LABEL: @ugt_t_poison_elt( ; CHECK-NEXT: ret <2 x i1> zeroinitializer ; - %r = icmp ugt <2 x i1> %a, + %r = icmp ugt <2 x i1> %a, ret <2 x i1> %r } @@ -161,11 +161,11 @@ define <2 x i1> @sge_t(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @sge_t_undef_elt(<2 x i1> %a) { -; CHECK-LABEL: @sge_t_undef_elt( +define <2 x i1> @sge_t_poison_elt(<2 x i1> %a) { +; CHECK-LABEL: @sge_t_poison_elt( ; CHECK-NEXT: ret <2 x i1> ; - %r = icmp sge <2 x i1> %a, + %r = icmp sge <2 x i1> %a, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll b/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll index f4a0b6ddf6621..045d773bf3284 100644 --- a/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll +++ b/llvm/test/Transforms/InstSimplify/icmp-not-bool-constant.ll @@ -33,11 +33,11 @@ define <2 x i1> @eq_f_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @eq_f_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @eq_f_not_undef( +define <2 x i1> @eq_f_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @eq_f_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp eq <2 x i1> %not, ret <2 x i1> %r } @@ -60,11 +60,11 @@ define <2 x i1> @ne_t_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ne_t_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @ne_t_not_undef( +define <2 x i1> @ne_t_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @ne_t_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp ne <2 x i1> %not, ret <2 x i1> %r } @@ -116,11 +116,11 @@ define <2 x i1> @ult_t_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @ult_t_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @ult_t_not_undef( +define <2 x i1> @ult_t_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @ult_t_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp ult <2 x i1> %not, ret <2 x i1> %r } @@ -152,11 +152,11 @@ define <2 x i1> @sgt_t_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @sgt_t_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @sgt_t_not_undef( +define <2 x i1> @sgt_t_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @sgt_t_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp sgt <2 x i1> %not, ret <2 x i1> %r } @@ -235,11 +235,11 @@ define <2 x i1> @ule_f_not_swap(<2 x i1> 
%a) { ret <2 x i1> %r } -define <2 x i1> @ule_f_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @ule_f_not_undef( +define <2 x i1> @ule_f_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @ule_f_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp ule <2 x i1> %not, ret <2 x i1> %r } @@ -271,11 +271,11 @@ define <2 x i1> @sge_f_not_swap(<2 x i1> %a) { ret <2 x i1> %r } -define <2 x i1> @sge_f_not_undef(<2 x i1> %a) { -; CHECK-LABEL: @sge_f_not_undef( +define <2 x i1> @sge_f_not_poison(<2 x i1> %a) { +; CHECK-LABEL: @sge_f_not_poison( ; CHECK-NEXT: ret <2 x i1> [[A:%.*]] ; - %not = xor <2 x i1> %a, + %not = xor <2 x i1> %a, %r = icmp sge <2 x i1> %not, ret <2 x i1> %r } diff --git a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll index 74039d3ffd56c..4d662c08b1a7a 100644 --- a/llvm/test/Transforms/InstSimplify/known-never-infinity.ll +++ b/llvm/test/Transforms/InstSimplify/known-never-infinity.ll @@ -1109,6 +1109,106 @@ define float @fcmp_ult_neginf_implies_class_assert(float %arg) { ret float %mul_by_zero } +define i1 @isKnownNeverInfinity_vector_reduce_maximum(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_maximum +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_maximum_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_maximum_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fmaximum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_minimum(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_minimum +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_minimum_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_minimum_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fminimum.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmax(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmax +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une 
double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmax_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmax_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmin(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmin +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: ret i1 true +; + %ninf.x = fadd ninf <4 x double> %x, + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + +define i1 @isKnownNeverInfinity_vector_reduce_fmin_fail(<4 x double> %x) { +; CHECK-LABEL: define i1 @isKnownNeverInfinity_vector_reduce_fmin_fail +; CHECK-SAME: (<4 x double> [[X:%.*]]) { +; CHECK-NEXT: [[NINF_X:%.*]] = fadd <4 x double> [[X]], +; CHECK-NEXT: [[OP:%.*]] = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[NINF_X]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[OP]], 0x7FF0000000000000 +; CHECK-NEXT: ret i1 [[CMP]] +; + %ninf.x = fadd <4 x double> %x, + %op = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %ninf.x) + %cmp = fcmp une double %op, 0x7ff0000000000000 + ret i1 %cmp +} + declare double @llvm.arithmetic.fence.f64(double) declare double @llvm.canonicalize.f64(double) declare double @llvm.ceil.f64(double) diff --git a/llvm/test/Transforms/InstSimplify/ldexp.ll b/llvm/test/Transforms/InstSimplify/ldexp.ll index c6bb0141199f2..d39f6a1e49673 100644 --- a/llvm/test/Transforms/InstSimplify/ldexp.ll +++ b/llvm/test/Transforms/InstSimplify/ldexp.ll @@ -57,11 +57,12 @@ define void @ldexp_f32_exp0(float %x) { define void @ldexp_v2f32_exp0(<2 x float> %x) { ; CHECK-LABEL: @ldexp_v2f32_exp0( ; CHECK-NEXT: store volatile <2 x float> [[X:%.*]], ptr addrspace(1) undef, align 8 -; CHECK-NEXT: store volatile <2 x float> [[X]], ptr addrspace(1) undef, align 8 +; CHECK-NEXT: [[PART_UNDEF1:%.*]] = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> [[X]], <2 x i32> ) +; CHECK-NEXT: store volatile <2 x float> [[PART_UNDEF1]], ptr addrspace(1) undef, align 8 ; CHECK-NEXT: store volatile <2 x float> [[X]], ptr addrspace(1) undef, align 8 ; CHECK-NEXT: ret void ; - %part.undef0 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> ) + %part.undef0 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> ) store volatile <2 x float> %part.undef0, ptr addrspace(1) undef %part.undef1 = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> ) diff --git a/llvm/test/Transforms/InstSimplify/mul.ll b/llvm/test/Transforms/InstSimplify/mul.ll index 8ae7f1eaac92b..a1b03a30fe4f4 100644 --- a/llvm/test/Transforms/InstSimplify/mul.ll +++ b/llvm/test/Transforms/InstSimplify/mul.ll @@ -34,11 +34,11 @@ define <16 x i8> @mul_by_0_vec(<16 x i8> %a) { ret <16 x i8> %b } -define <2 x i8> @mul_by_0_vec_undef_elt(<2 x i8> %a) { -; CHECK-LABEL: @mul_by_0_vec_undef_elt( +define <2 x i8> @mul_by_0_vec_poison_elt(<2 x i8> %a) { +; CHECK-LABEL: @mul_by_0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; - 
%b = mul <2 x i8> %a, + %b = mul <2 x i8> %a, ret <2 x i8> %b } diff --git a/llvm/test/Transforms/InstSimplify/negate.ll b/llvm/test/Transforms/InstSimplify/negate.ll index d72a0db6d445c..d07029becd1fe 100644 --- a/llvm/test/Transforms/InstSimplify/negate.ll +++ b/llvm/test/Transforms/InstSimplify/negate.ll @@ -17,11 +17,11 @@ define <2 x i32> @negate_nuw_vec(<2 x i32> %x) { ret <2 x i32> %neg } -define <2 x i32> @negate_nuw_vec_undef_elt(<2 x i32> %x) { -; CHECK-LABEL: @negate_nuw_vec_undef_elt( +define <2 x i32> @negate_nuw_vec_poison_elt(<2 x i32> %x) { +; CHECK-LABEL: @negate_nuw_vec_poison_elt( ; CHECK-NEXT: ret <2 x i32> zeroinitializer ; - %neg = sub nuw <2 x i32> , %x + %neg = sub nuw <2 x i32> , %x ret <2 x i32> %neg } @@ -43,12 +43,12 @@ define <2 x i8> @negate_zero_or_minsigned_nsw_vec(<2 x i8> %x) { ret <2 x i8> %neg } -define <2 x i8> @negate_zero_or_minsigned_nsw_vec_undef_elt(<2 x i8> %x) { -; CHECK-LABEL: @negate_zero_or_minsigned_nsw_vec_undef_elt( +define <2 x i8> @negate_zero_or_minsigned_nsw_vec_poison_elt(<2 x i8> %x) { +; CHECK-LABEL: @negate_zero_or_minsigned_nsw_vec_poison_elt( ; CHECK-NEXT: ret <2 x i8> zeroinitializer ; %signbit = shl <2 x i8> %x, - %neg = sub nsw <2 x i8> , %signbit + %neg = sub nsw <2 x i8> , %signbit ret <2 x i8> %neg } diff --git a/llvm/test/Transforms/InstSimplify/or.ll b/llvm/test/Transforms/InstSimplify/or.ll index 913b760dd331c..f241c6987b9e7 100644 --- a/llvm/test/Transforms/InstSimplify/or.ll +++ b/llvm/test/Transforms/InstSimplify/or.ll @@ -17,11 +17,11 @@ define i32 @all_ones(i32 %A) { ret i32 %B } -define <3 x i8> @all_ones_vec_with_undef_elt(<3 x i8> %A) { -; CHECK-LABEL: @all_ones_vec_with_undef_elt( +define <3 x i8> @all_ones_vec_with_poison_elt(<3 x i8> %A) { +; CHECK-LABEL: @all_ones_vec_with_poison_elt( ; CHECK-NEXT: ret <3 x i8> ; - %B = or <3 x i8> %A, + %B = or <3 x i8> %A, ret <3 x i8> %B } @@ -68,11 +68,11 @@ define i32 @or_not(i32 %A) { ret i32 %B } -define <2 x i4> @or_not_commute_vec_undef(<2 x i4> %A) { -; CHECK-LABEL: @or_not_commute_vec_undef( +define <2 x i4> @or_not_commute_vec_poison(<2 x i4> %A) { +; CHECK-LABEL: @or_not_commute_vec_poison( ; CHECK-NEXT: ret <2 x i4> ; - %NotA = xor <2 x i4> %A, + %NotA = xor <2 x i4> %A, %B = or <2 x i4> %NotA, %A ret <2 x i4> %B } @@ -335,7 +335,7 @@ define <2 x i1> @or_with_not_op_commute4(<2 x i1> %a, <2 x i1> %b) { ; CHECK-NEXT: ret <2 x i1> ; %ab = and <2 x i1> %b, %a - %not = xor <2 x i1> %ab, + %not = xor <2 x i1> %ab, %r = or <2 x i1> %not, %a ret <2 x i1> %r } @@ -515,6 +515,21 @@ define <2 x i4> @and_or_not_or_commute7_undef_elt(<2 x i4> %A, <2 x i4> %B) { ret <2 x i4> %r } +; doing the same with poison is safe. + +define <2 x i4> @and_or_not_or_commute7_poison_elt(<2 x i4> %A, <2 x i4> %B) { +; CHECK-LABEL: @and_or_not_or_commute7_poison_elt( +; CHECK-NEXT: [[NOTA:%.*]] = xor <2 x i4> [[A:%.*]], +; CHECK-NEXT: ret <2 x i4> [[NOTA]] +; + %nota = xor <2 x i4> %A, + %and = and <2 x i4> %B, %nota + %or = or <2 x i4> %B, %A + %notab = xor <2 x i4> %or, + %r = or <2 x i4> %notab, %and + ret <2 x i4> %r +} + ; (A | B) | (A ^ B) --> A | B define i69 @or_or_xor(i69 %A, i69 %B) { @@ -769,6 +784,21 @@ define <2 x i4> @or_nxor_and_undef_elt(<2 x i4> %a, <2 x i4> %b) { ret <2 x i4> %r } +; Same with poison is safe. 
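A minimal standalone sketch of the recurring rule in these or.ll hunks, assuming a plain opt -passes=instsimplify -S run; the constant lanes below are illustrative reconstructions, not literals quoted from this patch:

; A poison lane in an all-ones operand does not block the lane-wise fold:
; or X, -1 --> -1 holds per lane, and the poison lane simply stays poison,
; so the whole expression simplifies to the constant operand.
define <3 x i8> @all_ones_with_poison_lane_example(<3 x i8> %A) {
  %B = or <3 x i8> %A, <i8 -1, i8 poison, i8 -1>
  ret <3 x i8> %B
}

This mirrors @all_ones_vec_with_poison_elt above; by contrast, several *_undef_elt variants in this patch now expect the unsimplified form, because an undef lane may not be assumed to produce whatever value the fold needs.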
+ +define <2 x i4> @or_nxor_and_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @or_nxor_and_poison_elt( +; CHECK-NEXT: [[XOR:%.*]] = xor <2 x i4> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i4> [[XOR]], +; CHECK-NEXT: ret <2 x i4> [[NOT]] +; + %and = and <2 x i4> %b, %a + %xor = xor <2 x i4> %a, %b + %not = xor <2 x i4> %xor, + %r = or <2 x i4> %not, %and + ret <2 x i4> %r +} + ; ~(A ^ B) | (A | B) --> -1 define i4 @or_nxor_or_commute0(i4 %a, i4 %b) { @@ -849,15 +879,15 @@ define i4 @or_nxor_or_wrong_val2(i4 %a, i4 %b, i4 %c) { ret i4 %r } -; negative test - undef in 'not' is allowed +; negative test - poison in 'not' is allowed -define <2 x i4> @or_nxor_or_undef_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @or_nxor_or_undef_elt( +define <2 x i4> @or_nxor_or_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @or_nxor_or_poison_elt( ; CHECK-NEXT: ret <2 x i4> ; %or = or <2 x i4> %b, %a %xor = xor <2 x i4> %a, %b - %not = xor <2 x i4> %xor, + %not = xor <2 x i4> %xor, %r = or <2 x i4> %or, %not ret <2 x i4> %r } @@ -966,12 +996,12 @@ define i32 @or_xor_not_op_or_commute7(i32 %a, i32 %b){ ret i32 %r } -define <2 x i4> @or_xor_not_op_or_undef_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @or_xor_not_op_or_undef_elt( +define <2 x i4> @or_xor_not_op_or_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @or_xor_not_op_or_poison_elt( ; CHECK-NEXT: ret <2 x i4> ; %xor = xor <2 x i4> %a, %b - %nota = xor <2 x i4> %a, + %nota = xor <2 x i4> %a, %or = or <2 x i4> %nota, %b %r = or <2 x i4> %xor, %or ret <2 x i4> %r @@ -1082,6 +1112,21 @@ define <2 x i4> @or_nand_xor_undef_elt(<2 x i4> %x, <2 x i4> %y) { ret <2 x i4> %or } +; Same with poison is safe. + +define <2 x i4> @or_nand_xor_poison_elt(<2 x i4> %x, <2 x i4> %y) { +; CHECK-LABEL: @or_nand_xor_poison_elt( +; CHECK-NEXT: [[AND:%.*]] = and <2 x i4> [[Y:%.*]], [[X:%.*]] +; CHECK-NEXT: [[NAND:%.*]] = xor <2 x i4> [[AND]], +; CHECK-NEXT: ret <2 x i4> [[NAND]] +; + %and = and <2 x i4> %y, %x + %xor = xor <2 x i4> %x, %y + %nand = xor <2 x i4> %and, + %or = or <2 x i4> %xor, %nand + ret <2 x i4> %or +} + declare i32 @llvm.fshl.i32 (i32, i32, i32) declare i32 @llvm.fshr.i32 (i32, i32, i32) diff --git a/llvm/test/Transforms/InstSimplify/ptrmask.ll b/llvm/test/Transforms/InstSimplify/ptrmask.ll index dd83abfdeee46..d2c4a5dd7f035 100644 --- a/llvm/test/Transforms/InstSimplify/ptrmask.ll +++ b/llvm/test/Transforms/InstSimplify/ptrmask.ll @@ -40,7 +40,8 @@ define <2 x ptr addrspace(1) > @ptrmask_simplify_poison_and_zero_i32_vec_fail(<2 define <2 x ptr> @ptrmask_simplify_undef_and_ones_vec(<2 x ptr> %p) { ; CHECK-LABEL: define <2 x ptr> @ptrmask_simplify_undef_and_ones_vec ; CHECK-SAME: (<2 x ptr> [[P:%.*]]) { -; CHECK-NEXT: ret <2 x ptr> [[P]] +; CHECK-NEXT: [[R:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[P]], <2 x i64> ) +; CHECK-NEXT: ret <2 x ptr> [[R]] ; %r = call <2 x ptr> @llvm.ptrmask.v2p1.v2i64(<2 x ptr> %p, <2 x i64> ) ret <2 x ptr> %r diff --git a/llvm/test/Transforms/InstSimplify/rem.ll b/llvm/test/Transforms/InstSimplify/rem.ll index 5af3b5f7c5e0b..a46db0342042f 100644 --- a/llvm/test/Transforms/InstSimplify/rem.ll +++ b/llvm/test/Transforms/InstSimplify/rem.ll @@ -17,11 +17,11 @@ define <2 x i32> @zero_dividend_vector(<2 x i32> %A) { ret <2 x i32> %B } -define <2 x i32> @zero_dividend_vector_undef_elt(<2 x i32> %A) { -; CHECK-LABEL: @zero_dividend_vector_undef_elt( +define <2 x i32> @zero_dividend_vector_poison_elt(<2 x i32> %A) { +; CHECK-LABEL: @zero_dividend_vector_poison_elt( ; CHECK-NEXT: 
ret <2 x i32> zeroinitializer ; - %B = urem <2 x i32> , %A + %B = urem <2 x i32> , %A ret <2 x i32> %B } diff --git a/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll b/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll index 6fb12612f2f72..40b22c619f768 100644 --- a/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll +++ b/llvm/test/Transforms/InstSimplify/saturating-add-sub.ll @@ -44,7 +44,7 @@ define <2 x i8> @uadd_vector_0_commute(<2 x i8> %a) { ; CHECK-LABEL: @uadd_vector_0_commute( ; CHECK-NEXT: ret <2 x i8> [[A:%.*]] ; - %x2v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> , <2 x i8> %a) + %x2v = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> , <2 x i8> %a) ret <2 x i8> %x2v } @@ -156,7 +156,7 @@ define <2 x i8> @sadd_vector_0(<2 x i8> %a) { ; CHECK-LABEL: @sadd_vector_0( ; CHECK-NEXT: ret <2 x i8> [[A:%.*]] ; - %y1v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> ) + %y1v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> %a, <2 x i8> ) ret <2 x i8> %y1v } @@ -205,10 +205,10 @@ define i8 @sadd_scalar_maxval_commute(i8 %a) { define <2 x i8> @sadd_vector_maxval_commute(<2 x i8> %a) { ; CHECK-LABEL: @sadd_vector_maxval_commute( -; CHECK-NEXT: [[Y4V:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> [[A:%.*]]) +; CHECK-NEXT: [[Y4V:%.*]] = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> [[A:%.*]]) ; CHECK-NEXT: ret <2 x i8> [[Y4V]] ; - %y4v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> %a) + %y4v = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> , <2 x i8> %a) ret <2 x i8> %y4v } diff --git a/llvm/test/Transforms/InstSimplify/sdiv.ll b/llvm/test/Transforms/InstSimplify/sdiv.ll index 2514d90b01235..99092802cab02 100644 --- a/llvm/test/Transforms/InstSimplify/sdiv.ll +++ b/llvm/test/Transforms/InstSimplify/sdiv.ll @@ -158,11 +158,11 @@ define <2 x i32> @knownnegation_commute_vec_bad3(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %div } -define <3 x i32> @negated_operand_vec_undef(<3 x i32> %x) { -; CHECK-LABEL: @negated_operand_vec_undef( +define <3 x i32> @negated_operand_vec_poison(<3 x i32> %x) { +; CHECK-LABEL: @negated_operand_vec_poison( ; CHECK-NEXT: ret <3 x i32> ; - %negx = sub nsw <3 x i32> , %x + %negx = sub nsw <3 x i32> , %x %div = sdiv <3 x i32> %negx, %x ret <3 x i32> %div } diff --git a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll index 2a4ce85ed11f8..fcf8c31b25eed 100644 --- a/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll +++ b/llvm/test/Transforms/InstSimplify/select-inseltpoison.ll @@ -17,11 +17,11 @@ define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { ret <2 x i1> %s } -define <2 x i1> @bool_true_or_false_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @bool_true_or_false_vec_undef( +define <2 x i1> @bool_true_or_false_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @bool_true_or_false_vec_poison( ; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; - %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> + %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> ret <2 x i1> %s } @@ -57,27 +57,27 @@ define <2 x i32> @equal_arms_vec(<2 x i1> %cond, <2 x i32> %x) { ret <2 x i32> %V } -define <2 x i32> @equal_arms_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_undef( +define <2 x i32> @equal_arms_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_poison( ; CHECK-NEXT: ret <2 x i32> ; - %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> + %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> ret <2 x i32> %V } -define <3 x float> @equal_arms_vec_less_undef(<3 x i1> 
%cond) { -; CHECK-LABEL: @equal_arms_vec_less_undef( +define <3 x float> @equal_arms_vec_less_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_less_poison( ; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } -define <3 x float> @equal_arms_vec_more_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_more_undef( -; CHECK-NEXT: ret <3 x float> +define <3 x float> @equal_arms_vec_more_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_more_poison( +; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } @@ -105,19 +105,19 @@ define <2 x i8> @vsel_mixedvec() { ret <2 x i8> %s } -define <3 x i8> @vsel_undef_true_op(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @vsel_undef_true_op( +define <3 x i8> @vsel_poison_true_op(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @vsel_poison_true_op( ; CHECK-NEXT: ret <3 x i8> [[X:%.*]] ; - %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y + %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y ret <3 x i8> %s } -define <3 x i4> @vsel_undef_false_op(<3 x i4> %x, <3 x i4> %y) { -; CHECK-LABEL: @vsel_undef_false_op( +define <3 x i4> @vsel_poison_false_op(<3 x i4> %x, <3 x i4> %y) { +; CHECK-LABEL: @vsel_poison_false_op( ; CHECK-NEXT: ret <3 x i4> [[Y:%.*]] ; - %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y + %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y ret <3 x i4> %s } diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll index fe93a0c3f2125..40c1460e3ebc3 100644 --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -25,11 +25,11 @@ define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { ret <2 x i1> %s } -define <2 x i1> @bool_true_or_false_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @bool_true_or_false_vec_undef( +define <2 x i1> @bool_true_or_false_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @bool_true_or_false_vec_poison( ; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; - %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> + %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> ret <2 x i1> %s } @@ -65,27 +65,27 @@ define <2 x i32> @equal_arms_vec(<2 x i1> %cond, <2 x i32> %x) { ret <2 x i32> %V } -define <2 x i32> @equal_arms_vec_undef(<2 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_undef( +define <2 x i32> @equal_arms_vec_poison(<2 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_poison( ; CHECK-NEXT: ret <2 x i32> ; - %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> + %V = select <2 x i1> %cond, <2 x i32> , <2 x i32> ret <2 x i32> %V } -define <3 x float> @equal_arms_vec_less_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_less_undef( +define <3 x float> @equal_arms_vec_less_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_less_poison( ; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } -define <3 x float> @equal_arms_vec_more_undef(<3 x i1> %cond) { -; CHECK-LABEL: @equal_arms_vec_more_undef( -; CHECK-NEXT: ret <3 x float> +define <3 x float> @equal_arms_vec_more_poison(<3 x i1> %cond) { +; CHECK-LABEL: @equal_arms_vec_more_poison( +; CHECK-NEXT: ret <3 x float> ; - %V = select <3 x i1> %cond, <3 x float> , <3 x float> + %V = select <3 x i1> %cond, <3 x float> , <3 x float> ret <3 x float> %V } @@ -113,19 +113,19 @@ 
define <2 x i8> @vsel_mixedvec() { ret <2 x i8> %s } -define <3 x i8> @vsel_undef_true_op(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @vsel_undef_true_op( +define <3 x i8> @vsel_poison_true_op(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @vsel_poison_true_op( ; CHECK-NEXT: ret <3 x i8> [[X:%.*]] ; - %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y + %s = select <3 x i1>, <3 x i8> %x, <3 x i8> %y ret <3 x i8> %s } -define <3 x i4> @vsel_undef_false_op(<3 x i4> %x, <3 x i4> %y) { -; CHECK-LABEL: @vsel_undef_false_op( +define <3 x i4> @vsel_poison_false_op(<3 x i4> %x, <3 x i4> %y) { +; CHECK-LABEL: @vsel_poison_false_op( ; CHECK-NEXT: ret <3 x i4> [[Y:%.*]] ; - %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y + %s = select <3 x i1>, <3 x i4> %x, <3 x i4> %y ret <3 x i4> %s } diff --git a/llvm/test/Transforms/InstSimplify/shift.ll b/llvm/test/Transforms/InstSimplify/shift.ll index b562c3c164d52..a816fcbdeeee0 100644 --- a/llvm/test/Transforms/InstSimplify/shift.ll +++ b/llvm/test/Transforms/InstSimplify/shift.ll @@ -17,11 +17,11 @@ define i41 @shl_0(i41 %X) { ret i41 %B } -define <2 x i41> @shl_0_vec_undef_elt(<2 x i41> %X) { -; CHECK-LABEL: @shl_0_vec_undef_elt( +define <2 x i41> @shl_0_vec_poison_elt(<2 x i41> %X) { +; CHECK-LABEL: @shl_0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i41> zeroinitializer ; - %B = shl <2 x i41> , %X + %B = shl <2 x i41> , %X ret <2 x i41> %B } @@ -41,11 +41,11 @@ define i39 @ashr_0(i39 %X) { ret i39 %B } -define <2 x i141> @ashr_0_vec_undef_elt(<2 x i141> %X) { -; CHECK-LABEL: @ashr_0_vec_undef_elt( +define <2 x i141> @ashr_0_vec_poison_elt(<2 x i141> %X) { +; CHECK-LABEL: @ashr_0_vec_poison_elt( ; CHECK-NEXT: ret <2 x i141> zeroinitializer ; - %B = shl <2 x i141> , %X + %B = shl <2 x i141> , %X ret <2 x i141> %B } @@ -113,11 +113,11 @@ define i32 @ashr_all_ones(i32 %A) { ret i32 %B } -define <3 x i8> @ashr_all_ones_vec_with_undef_elts(<3 x i8> %x, <3 x i8> %y) { -; CHECK-LABEL: @ashr_all_ones_vec_with_undef_elts( +define <3 x i8> @ashr_all_ones_vec_with_poison_elts(<3 x i8> %x, <3 x i8> %y) { +; CHECK-LABEL: @ashr_all_ones_vec_with_poison_elts( ; CHECK-NEXT: ret <3 x i8> ; - %sh = ashr <3 x i8> , %y + %sh = ashr <3 x i8> , %y ret <3 x i8> %sh } @@ -306,11 +306,22 @@ define <2 x i7> @all_ones_left_right_splat(<2 x i7> %x) { ; Poison could propagate, but undef must not. 
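A hedged sketch of the rule stated in the comment above, again with reconstructed constant lanes rather than the patch's literal constants:

; Expected to fold under opt -passes=instsimplify -S:
; (-1 << %x) ashr %x --> -1 holds per lane, and the poison lane is still
; poison after both shifts, so folding it together with the -1 lanes is sound.
define <3 x i7> @splat_with_poison_lane_example(<3 x i7> %x) {
  %left = shl <3 x i7> <i7 -1, i7 poison, i7 -1>, %x
  %right = ashr <3 x i7> %left, %x
  ret <3 x i7> %right
}

With undef in that lane instead, shl pins the low bits of the lane to known zero, so the result is neither undef nor necessarily all-ones and the fold has to be skipped; the split pair of tests that follows checks exactly that asymmetry.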
-define <3 x i7> @all_ones_left_right_splat_poison_undef_elt(<3 x i7> %x) { -; CHECK-LABEL: @all_ones_left_right_splat_poison_undef_elt( +define <3 x i7> @all_ones_left_right_splat_undef_elt(<3 x i7> %x) { +; CHECK-LABEL: @all_ones_left_right_splat_undef_elt( +; CHECK-NEXT: [[LEFT:%.*]] = shl <3 x i7> , [[X:%.*]] +; CHECK-NEXT: [[RIGHT:%.*]] = ashr <3 x i7> [[LEFT]], [[X]] +; CHECK-NEXT: ret <3 x i7> [[RIGHT]] +; + %left = shl <3 x i7> , %x + %right = ashr <3 x i7> %left, %x + ret <3 x i7> %right +} + +define <3 x i7> @all_ones_left_right_splat_poison_elt(<3 x i7> %x) { +; CHECK-LABEL: @all_ones_left_right_splat_poison_elt( ; CHECK-NEXT: ret <3 x i7> ; - %left = shl <3 x i7> , %x + %left = shl <3 x i7> , %x %right = ashr <3 x i7> %left, %x ret <3 x i7> %right } diff --git a/llvm/test/Transforms/InstSimplify/srem.ll b/llvm/test/Transforms/InstSimplify/srem.ll index b1cbdf35b3c7c..ab726832e517b 100644 --- a/llvm/test/Transforms/InstSimplify/srem.ll +++ b/llvm/test/Transforms/InstSimplify/srem.ll @@ -39,11 +39,11 @@ define <2 x i32> @knownnegation_commute_vec(<2 x i32> %x, <2 x i32> %y) { ret <2 x i32> %rem } -define <3 x i32> @negated_operand_vec_undef(<3 x i32> %x) { -; CHECK-LABEL: @negated_operand_vec_undef( +define <3 x i32> @negated_operand_vec_poison(<3 x i32> %x) { +; CHECK-LABEL: @negated_operand_vec_poison( ; CHECK-NEXT: ret <3 x i32> zeroinitializer ; - %negx = sub <3 x i32> , %x + %negx = sub <3 x i32> , %x %rem = srem <3 x i32> %negx, %x ret <3 x i32> %rem } diff --git a/llvm/test/Transforms/InstSimplify/sub.ll b/llvm/test/Transforms/InstSimplify/sub.ll index deb0ee33cd920..fd88fc15716c8 100644 --- a/llvm/test/Transforms/InstSimplify/sub.ll +++ b/llvm/test/Transforms/InstSimplify/sub.ll @@ -29,7 +29,7 @@ define <2 x i32> @sub_zero_vec(<2 x i32> %A) { ; CHECK-LABEL: @sub_zero_vec( ; CHECK-NEXT: ret <2 x i32> [[A:%.*]] ; - %B = sub <2 x i32> %A, + %B = sub <2 x i32> %A, ret <2 x i32> %B } @@ -46,8 +46,8 @@ define <2 x i32> @neg_neg_vec(<2 x i32> %A) { ; CHECK-LABEL: @neg_neg_vec( ; CHECK-NEXT: ret <2 x i32> [[A:%.*]] ; - %B = sub <2 x i32> , %A - %C = sub <2 x i32> , %B + %B = sub <2 x i32> , %A + %C = sub <2 x i32> , %B ret <2 x i32> %C } diff --git a/llvm/test/Transforms/InstSimplify/xor.ll b/llvm/test/Transforms/InstSimplify/xor.ll index 0e23cc66c1652..229e943a3836f 100644 --- a/llvm/test/Transforms/InstSimplify/xor.ll +++ b/llvm/test/Transforms/InstSimplify/xor.ll @@ -156,6 +156,20 @@ define <2 x i4> @xor_and_or_not_undef_elt(<2 x i4> %a, <2 x i4> %b) { ret <2 x i4> %r } +; but correct to propagate poison element + +define <2 x i4> @xor_and_or_not_poison_elt(<2 x i4> %a, <2 x i4> %b) { +; CHECK-LABEL: @xor_and_or_not_poison_elt( +; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i4> [[A:%.*]], +; CHECK-NEXT: ret <2 x i4> [[NOT]] +; + %and = and <2 x i4> %b, %a + %not = xor <2 x i4> %a, + %or = or <2 x i4> %not, %b + %r = xor <2 x i4> %or, %and + ret <2 x i4> %r +} + define i4 @xor_or_and_not_commute0(i4 %a, i4 %b) { ; CHECK-LABEL: @xor_or_and_not_commute0( ; CHECK-NEXT: ret i4 [[A:%.*]] @@ -277,11 +291,11 @@ define i4 @xor_or_and_not_wrong_val2(i4 %a, i4 %b, i4 %c) { ret i4 %r } -define <2 x i4> @xor_or_and_not_undef_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @xor_or_and_not_undef_elt( +define <2 x i4> @xor_or_and_not_poison_elt(<2 x i4> %a, <2 x i4> %b) { -; CHECK-LABEL: @xor_or_and_not_poison_elt( ; CHECK-NEXT: ret <2 x i4> [[A:%.*]] ; - %not = xor <2 x i4> %a, + %not = xor <2 x i4> %a, %and = and <2 x i4> %b, %not %or = or <2 x i4> %a, %b %r = xor <2 x i4> %or, %and diff --git
a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll index feb22aa1a3763..45e2c36836ffc 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll @@ -491,6 +491,30 @@ define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1 ret void } +; Ensure vscale_range property does not affect scalable vector types. +define { , } @deinterleave_nxptr_factor2(ptr %ptr) #2 { +; CHECK-LABEL: define { , } @deinterleave_nxptr_factor2( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr , ptr [[PTR]], i64 2 +; CHECK-NEXT: [[LDN2:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[LDN2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN2]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP5]], [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , } poison, [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , } [[TMP11]], [[TMP10]], 1 +; CHECK-NEXT: ret { , } [[TMP12]] +; + %wide.vec = load , ptr %ptr, align 8 + %ldN = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %wide.vec) + ret { , } %ldN +} + attributes #0 = { vscale_range(2,2) "target-features"="+sve" } attributes #1 = { vscale_range(2,4) "target-features"="+sve" } attributes #2 = { vscale_range(4,4) "target-features"="+sve" } diff --git a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll index 6e6c045661c24..3ac5a69a496ff 100644 --- a/llvm/test/Transforms/LoopFlatten/widen-iv3.ll +++ b/llvm/test/Transforms/LoopFlatten/widen-iv3.ll @@ -35,7 +35,7 @@ define i16 @foo() { ; CHECK-NEXT: [[SUM_110:%.*]] = phi i16 [ [[SUM_012]], [[FOR_COND1_PREHEADER]] ], [ [[ADD5]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[INDVAR]], [[TMP0]] ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[J_011]], [[MUL]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i32 [[TMP2]] to i16 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], ptr @v, i16 0, i16 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll index 2470bca1e17b9..1c26ee8479e57 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-select-cmp.ll @@ -8,41 +8,39 @@ target triple = "aarch64-linux-gnu" define i32 
@select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 7, i32 3 +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp ; CHECK-VF4IC4: vector.body: -; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] -; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ] +; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ] ; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq {{.*}}, shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor [[VEC_ICMP1]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor [[VEC_ICMP2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor 
[[VEC_ICMP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor [[VEC_ICMP4]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or [[VEC_PHI1]], [[NOT1]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or [[VEC_PHI2]], [[NOT2]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or [[VEC_PHI3]], [[NOT3]] -; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or [[VEC_PHI4]], [[NOT4]] +; CHECK-VF4IC4-NEXT: [[VEC_SEL1]] = select [[VEC_ICMP1]], [[VEC_PHI1]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL2]] = select [[VEC_ICMP2]], [[VEC_PHI2]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL3]] = select [[VEC_ICMP3]], [[VEC_PHI3]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL4]] = select [[VEC_ICMP4]], [[VEC_PHI4]], shufflevector ( insertelement ( poison, i32 7, i64 0), poison, zeroinitializer) ; CHECK-VF4IC4: middle.block: -; CHECK-VF4IC4-NEXT: [[OR1:%.*]] = or [[VEC_SEL2]], [[VEC_SEL1]] -; CHECK-VF4IC4-NEXT: [[OR2:%.*]] = or [[VEC_SEL3]], [[OR1]] -; CHECK-VF4IC4-NEXT: [[OR3:%.*]] = or [[VEC_SEL4]], [[OR2]] -; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[OR3]]) -; CHECK-VF4IC4-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[FR]], i32 7, i32 3 +; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne [[VEC_SEL1]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select [[VEC_ICMP5]], [[VEC_SEL1]], [[VEC_SEL2]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne [[VEC_SEL5]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select [[VEC_ICMP6]], [[VEC_SEL5]], [[VEC_SEL3]] +; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne [[VEC_SEL6]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = select [[VEC_ICMP7]], [[VEC_SEL6]], [[VEC_SEL4]] +; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL7]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) +; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3 entry: br label %for.body @@ -64,18 +62,21 @@ exit: ; preds = %for.body define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_i32_from_icmp ; CHECK-VF4IC1: vector.ph: -; CHECK-VF4IC1-NOT: shufflevector -; CHECK-VF4IC1-NOT: shufflevector +; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector [[TMP1]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement poison, i32 %b, i64 0 +; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector [[TMP2]], poison, zeroinitializer ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 3, 
i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], [[SPLAT_OF_B]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 %b, i32 %a +; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement poison, i32 %a, i64 0 +; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector [[FIN_INS]], poison, zeroinitializer +; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne [[VEC_SEL]], [[FIN_SPLAT]] +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_CMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a ; CHECK-VF4IC4-LABEL: @select_i32_from_icmp ; CHECK-VF4IC4: vector.body: @@ -100,15 +101,14 @@ exit: ; preds = %for.body define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq [[VEC_LOAD]], shufflevector ( insertelement ( poison, float 3.000000e+00, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[NOT:%*]] = xor [[VEC_ICMP]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or [[VEC_PHI]], [[NOT]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select [[VEC_ICMP]], [[VEC_PHI]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 1, i32 2 +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2 ; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp ; CHECK-VF4IC4: vector.body: @@ -156,17 +156,17 @@ exit: ; preds = %for.body define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 { ; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp ; CHECK-VF4IC1: vector.body: -; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] +; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ] ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load ; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt [[VEC_LOAD]], shufflevector ( insertelement ( poison, i32 35, i64 0), poison, zeroinitializer) ; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr {{%.*}}, i32 4, [[MASK]], poison) ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq [[MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 2, i64 
0), poison, zeroinitializer) -; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = or [[VEC_PHI]], [[VEC_ICMP]] -; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select [[MASK]], [[VEC_SEL_TMP]], [[VEC_PHI]] +; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = select [[VEC_ICMP]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), [[VEC_PHI]] +; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select [[MASK]], [[VEC_SEL_TMP]], [[VEC_PHI]] ; CHECK-VF4IC1: middle.block: -; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[VEC_SEL]]) -; CHECK-VF4IC1-NEXT: [[FR:%.*]] = freeze i1 [[OR_RDX]] -; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR]], i32 1, i32 0 +; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne [[VEC_SEL]], zeroinitializer +; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[FIN_ICMP]]) +; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0 ; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp ; CHECK-VF4IC4: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index d5ace655fdcc1..c22613509be4f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -46,8 +46,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison) +; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll index c49f7d4a5d5f1..eeef8f199353b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -102,10 +102,10 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[N_VEC]], 8 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[TMP1]] @@ -116,15 +116,12 @@ define void @uniform_store_i1(ptr noalias %dst, ptr noalias %start, i64 %N) { ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; 
CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <32 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP3]], i64 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <32 x ptr> [[TMP4]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, <32 x ptr> [[TMP2]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <32 x ptr> [[TMP5]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <32 x i1> [[TMP7]], i32 31 ; CHECK-NEXT: store i1 [[TMP8]], ptr [[DST:%.*]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 512 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 256 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll index 2b58acbfe9cc9..8a2dc0abb0de8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/select-cmp-reduction.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S \ ; RUN: < %s | FileCheck %s ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 \ @@ -6,59 +7,109 @@ target triple = "riscv64" define i32 @select_icmp(i32 %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @select_icmp +; CHECK-LABEL: define i32 @select_icmp( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP7]], [[X]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; -; SCALABLE-LABEL: @select_icmp +; SCALABLE-LABEL: define i32 @select_icmp( +; SCALABLE-SAME: i32 [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] 
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP10]] = select [[TMP9]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP10]], zeroinitializer +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 
@llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; SCALABLE-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], [[X]] +; SCALABLE-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE: for.end: +; SCALABLE-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[COND_LCSSA]] ; entry: br label %for.body @@ -79,59 +130,109 @@ for.end: } define i32 @select_fcmp(float %x, i32 %y, ptr nocapture readonly %c, i64 %n) #0 { -; CHECK-LABEL: @select_fcmp +; CHECK-LABEL: define i32 @select_fcmp( +; CHECK-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X:%.*]], i64 0 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[Y]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, 
ptr [[C]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP3:%.*]] = fcmp fast olt <4 x float> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i32 [[Y]], i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP7]], [[X]] +; CHECK-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[COND_LCSSA]] ; -; SCALABLE-LABEL: @select_fcmp +; SCALABLE-LABEL: define i32 @select_fcmp( +; SCALABLE-SAME: float [[X:%.*]], i32 [[Y:%.*]], ptr nocapture readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub 
i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X:%.*]], i64 0 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[X]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0 -; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] -; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0 +; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = fcmp fast olt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; SCALABLE-NEXT: [[TMP10]] = select [[TMP9]], [[VEC_PHI]], [[BROADCAST_SPLAT2]] +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %y, i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP10]], zeroinitializer +; SCALABLE-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP12]], i32 [[Y]], i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; 
SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[A:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[COND:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[INDVARS_IV]] +; SCALABLE-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = fcmp fast olt float [[TMP13]], [[X]] +; SCALABLE-NEXT: [[COND]] = select i1 [[CMP1]], i32 [[A]], i32 [[Y]] +; SCALABLE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; SCALABLE: for.end: +; SCALABLE-NEXT: [[COND_LCSSA:%.*]] = phi i32 [ [[COND]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[COND_LCSSA]] ; entry: br label %for.body @@ -152,55 +253,101 @@ for.end: } define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_i32_from_icmp +; CHECK-LABEL: define i32 @select_const_i32_from_icmp( +; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3 +; 
CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 7, i32 3
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3
+; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 7
+; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[DOTLCSSA]]
 ;
-; SCALABLE-LABEL: @select_const_i32_from_icmp
+; SCALABLE-LABEL: define i32 @select_const_i32_from_icmp(
+; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SCALABLE-NEXT: entry:
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE: vector.ph:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]]
+; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT: [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 7, i32 3
+; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE: scalar.ph:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: br label [[FOR_BODY:%.*]]
+; SCALABLE: for.body:
+; SCALABLE-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP21]]
+; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4
+; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3
+; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 7
+; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP21]], 1
+; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]]
+; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; SCALABLE: exit:
+; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: ret i32 [[DOTLCSSA]]
 ;
 entry:
   br label %for.body
@@ -221,55 +368,113 @@ exit: ; preds = %for.body
 }
 
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
-; CHECK-LABEL: @select_i32_from_icmp
+; CHECK-LABEL: define i32 @select_i32_from_icmp(
+; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32>
poison, i32 [[B]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 [[B]], i32 [[A]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 3 +; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 [[B]] +; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP15]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]] +; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[DOTLCSSA]] ; -; SCALABLE-LABEL: 
@select_i32_from_icmp +; SCALABLE-LABEL: define i32 @select_i32_from_icmp( +; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4 +; SCALABLE-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SCALABLE-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector [[MINMAX_IDENT_SPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[B]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V:%.*]], i64 [[TMP4]] +; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP4]] ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; SCALABLE-NEXT: [[TMP8:%.*]] = icmp eq [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[NOT:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9]] = or [[VEC_PHI]], [[NOT]] +; SCALABLE-NEXT: [[TMP9]] = select [[TMP8]], [[VEC_PHI]], [[BROADCAST_SPLAT]] ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 %b, i32 %a +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[A]], i64 0 +; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[TMP9]], [[DOTSPLAT]] +; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], 
i32 [[B]], i32 [[A]] +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP21]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 +; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 3 +; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 [[B]] +; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP21]], 1 +; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[DOTLCSSA]] ; entry: br label %for.body @@ -290,55 +495,101 @@ exit: ; preds = %for.body } define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) #0 { -; CHECK-LABEL: @select_const_i32_from_fcmp +; CHECK-LABEL: define i32 @select_const_i32_from_fcmp( +; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fcmp fast ueq <4 x float> [[WIDE_LOAD]], -; CHECK-NEXT: [[NOT:%.*]] = xor <4 x i1> [[TMP4]], -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[NOT]] +; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 
@llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
-; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP7]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 2
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP15:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fcmp fast ueq float [[TMP10]], 3.000000e+00
+; CHECK-NEXT: [[TMP12]] = select i1 [[TMP11]], i32 [[TMP8]], i32 1
+; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP15]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], [[N]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK: exit:
+; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP12]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[DOTLCSSA]]
 ;
-; SCALABLE-LABEL: @select_const_i32_from_fcmp
+; SCALABLE-LABEL: define i32 @select_const_i32_from_fcmp(
+; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SCALABLE-NEXT: entry:
+; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; SCALABLE: vector.ph:
 ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]]
-; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]]
+; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; SCALABLE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALABLE: vector.body:
-; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V:%.*]], i64 [[TMP4]]
+; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP4]]
 ; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP5]], i32 0
 ; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP6]], align 4
 ; SCALABLE-NEXT: [[TMP8:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i64 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[NOT:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; SCALABLE-NEXT: [[TMP9]] = or <vscale x 4 x i1> [[VEC_PHI]], [[NOT]]
+; SCALABLE-NEXT: [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
 ; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
 ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; SCALABLE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE: middle.block:
-; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP9]])
-; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP13]]
-; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 2
+; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; SCALABLE-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
+; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2
+; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE: scalar.ph:
+; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: br label [[FOR_BODY:%.*]]
+; SCALABLE: for.body:
+; SCALABLE-NEXT: [[TMP21:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP14:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
+; SCALABLE-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[V]], i64 [[TMP21]]
+; SCALABLE-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4
+; SCALABLE-NEXT: [[TMP17:%.*]] = fcmp fast ueq float [[TMP16]], 3.000000e+00
+; SCALABLE-NEXT: [[TMP18]] = select i1 [[TMP17]], i32 [[TMP14]], i32 1
+; SCALABLE-NEXT: [[TMP19]] = add nuw nsw i64 [[TMP21]], 1
+; SCALABLE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[N]]
+; SCALABLE-NEXT: br i1 [[TMP20]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE: exit:
+; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP18]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; SCALABLE-NEXT: ret i32 [[DOTLCSSA]]
 ;
 entry:
   br label %for.body
@@ -359,11 +610,41 @@ exit: ; preds = %for.body
 }
 
 define float @select_const_f32_from_icmp(ptr nocapture readonly %v, i64 %n) #0 {
-; CHECK-LABEL: @select_const_f32_from_icmp
-; CHECK-NOT: vector.body
+; CHECK-LABEL: define float @select_const_f32_from_icmp(
+; CHECK-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
+; CHECK-NEXT:
[[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00 +; CHECK-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]] +; CHECK-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]] +; CHECK: exit: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ] +; CHECK-NEXT: ret float [[DOTLCSSA]] ; -; SCALABLE-LABEL: @select_const_f32_from_icmp -; SCALABLE-NOT: vector.body +; SCALABLE-LABEL: define float @select_const_f32_from_icmp( +; SCALABLE-SAME: ptr nocapture readonly [[V:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] +; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[V]], i64 [[TMP0]] +; SCALABLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3 +; SCALABLE-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00 +; SCALABLE-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1 +; SCALABLE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N]] +; SCALABLE-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]] +; SCALABLE: exit: +; SCALABLE-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ] +; SCALABLE-NEXT: ret float [[DOTLCSSA]] ; entry: br label %for.body @@ -384,63 +665,127 @@ exit: ; preds = %for.body } define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) #0 { -; CHECK-LABEL: @pred_select_const_i32_from_icmp +; CHECK-LABEL: define i32 @pred_select_const_i32_from_icmp( +; CHECK-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> 
@llvm.masked.load.v4i32.p0(ptr [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], -; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i1> [[VEC_PHI]], [[TMP8]] -; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP9]], <4 x i1> [[VEC_PHI]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i32> [[WIDE_LOAD]], +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP5]], i32 4, <4 x i1> [[TMP3]], <4 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> , <4 x i32> [[VEC_PHI]] +; CHECK-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP7]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[PREDPHI]]) -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[TMP12]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0 +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i32> [[PREDPHI]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP9]], i32 1, i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP10]], 35 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP11]], 2 +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ 
[[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[R_1_LCSSA]] ; -; SCALABLE-LABEL: @pred_select_const_i32_from_icmp +; SCALABLE-LABEL: define i32 @pred_select_const_i32_from_icmp( +; SCALABLE-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SCALABLE-NEXT: entry: +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 %n, [[TMP3]] -; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 %n, [[N_MOD_VF]] -; SCALABLE-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; SCALABLE-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4 +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 ; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 [[TMP4]] -; SCALABLE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 -; SCALABLE-NEXT: [[TMP8:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 35, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[SRC2:%.*]], i64 [[TMP4]] -; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 -; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP10]], i32 4, [[TMP8]], poison) +; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 +; SCALABLE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 +; SCALABLE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp sgt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 35, i64 0), poison, zeroinitializer) +; SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[SRC2]], i64 [[TMP6]] +; SCALABLE-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 +; SCALABLE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP11]], i32 4, [[TMP9]], poison) ; SCALABLE-NEXT: [[TMP12:%.*]] = icmp eq [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) -; SCALABLE-NEXT: [[TMP13:%.*]] = or [[VEC_PHI]], [[TMP12]] -; SCALABLE-NEXT: [[PREDPHI]] = select [[TMP8]], [[TMP13]], [[VEC_PHI]] -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; SCALABLE-NEXT: [[TMP13:%.*]] = select [[TMP12]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer), [[VEC_PHI]] +; SCALABLE-NEXT: [[PREDPHI]] = select [[TMP9]], [[TMP13]], [[VEC_PHI]] +; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; SCALABLE: middle.block: -; SCALABLE-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[PREDPHI]]) -; SCALABLE-NEXT: [[FR:%.*]] = freeze i1 [[TMP18]] -; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR]], i32 1, i32 0 +; SCALABLE-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne [[PREDPHI]], zeroinitializer +; SCALABLE-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[RDX_SELECT_CMP]]) +; SCALABLE-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP15]], i32 1, i32 0 +; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; SCALABLE: scalar.ph: +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALABLE-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: br label [[FOR_BODY:%.*]] +; SCALABLE: for.body: +; SCALABLE-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; SCALABLE-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; SCALABLE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; SCALABLE-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; SCALABLE-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP16]], 35 +; SCALABLE-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALABLE: if.then: +; SCALABLE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; SCALABLE-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; SCALABLE-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP17]], 2 +; SCALABLE-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] +; SCALABLE-NEXT: br label [[FOR_INC]] +; SCALABLE: for.inc: +; SCALABLE-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; SCALABLE-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; SCALABLE-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; SCALABLE-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; SCALABLE: for.end.loopexit: +; SCALABLE-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; SCALABLE-NEXT: ret i32 [[R_1_LCSSA]] ; entry: br label %for.body @@ -472,3 +817,34 @@ for.end.loopexit: ; preds = %for.inc } attributes #0 = { "target-features"="+f,+v" } +;. 
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. +; SCALABLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; SCALABLE: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; SCALABLE: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; SCALABLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; SCALABLE: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; SCALABLE: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; SCALABLE: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; SCALABLE: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; SCALABLE: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; SCALABLE: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; SCALABLE: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; SCALABLE: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; SCALABLE: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; SCALABLE: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +;. 
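The select-cmp-reduction.ll changes above switch the vectorized "any of" reduction from an i1 `or` across compare results (followed by `freeze` and a scalar `select` in the middle block) to a select-based form: the vector phi now carries the i32 reduction values themselves, and the middle block compares them against the start value with `icmp ne` before the `or` reduction, so the `freeze` is no longer emitted. As an illustrative sketch only (not part of the patch, and with an assumed function name), the scalar pattern behind the SCALABLE checks above corresponds roughly to:

    // Illustrative C++ only, mirroring the scalar for.body in the checks
    // above: the reduction keeps its start value (2) until an element fails
    // the fast-math compare against 3.0f, at which point it latches to 1.
    #include <cstdint>

    int select_const_i32_from_fcmp(const float *v, int64_t n) { // name assumed
      int r = 2; // start value; the middle block now compares against it
      for (int64_t i = 0; i < n; ++i)
        r = (v[i] == 3.0f) ? r : 1; // 'fcmp fast ueq' + 'select' in the IR
      return r;
    }

The same i1-to-i32 rewrite appears in the fixed-width CHECK lines for pred_select_const_i32_from_icmp above, where the old `or <4 x i1>` accumulator becomes a `select` producing a `<4 x i32>` value.
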
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index c55e732c90147..59b8ce42380d9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -25,7 +25,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[ZEXT]], 2147483616 -; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP0]], 1.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] @@ -201,7 +201,7 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775792 -; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to double +; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to double ; AUTO_VEC-NEXT: [[TMP0:%.*]] = fmul fast double [[DOTCAST]], 3.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; AUTO_VEC: vector.body: @@ -366,7 +366,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; AUTO_VEC: vector.ph: ; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 -; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[DOTCAST]], 4.200000e+01 ; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00 ; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index eea2894f82794..aea72b7de5f42 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1400,15 +1400,15 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12 ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3 ; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> -; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: 
[[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], @@ -1524,15 +1524,15 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX512-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -24 ; AVX512-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -7 ; AVX512-NEXT: [[REVERSE12:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> -; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP25]], i32 8, <8 x i1> [[REVERSE12]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE13:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[REVERSE14:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP27]], i32 8, <8 x i1> [[REVERSE14]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE16:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD15]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[REVERSE17:%.*]] = shufflevector <8 x i1> [[TMP18]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP29]], i32 8, <8 x i1> [[REVERSE17]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE19:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD18]], <8 x double> poison, <8 x i32> +; AVX512-NEXT: [[REVERSE20:%.*]] = shufflevector <8 x i1> [[TMP19]], <8 x i1> poison, <8 x i32> ; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[TMP31]], i32 8, <8 x i1> [[REVERSE20]], <8 x double> poison), !alias.scope !34 ; AVX512-NEXT: [[REVERSE22:%.*]] = shufflevector <8 x double> [[WIDE_MASKED_LOAD21]], <8 x double> poison, <8 x i32> ; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[REVERSE13]], diff --git a/llvm/test/Transforms/LoopVectorize/blend-in-header.ll b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll new file mode 100644 index 0000000000000..01e223a324379 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/blend-in-header.ll @@ -0,0 +1,233 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128-ni:1-p2:32:8:8:32-ni:2" + +; Test with blend recipe in header VPBB, from +; https://github.com/llvm/llvm-project/issues/88297. +define i64 @pr88297() { +; CHECK-LABEL: define i64 @pr88297() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 false, label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 1, [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 1, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} + +define i64 @pr88297_incoming_ops_reordered() { +; CHECK-LABEL: define i64 @pr88297_incoming_ops_reordered() { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 false, label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 1, [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; 
CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ 1, [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 false, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 0, %loop.header ], [ 1, %then ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} + +define i64 @invar_cond(i1 %c) { +; CHECK-LABEL: define i64 @invar_cond( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> zeroinitializer, <4 x i64> +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 1, [[THEN]] ], [ 0, [[LOOP_HEADER]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 1, %then ], [ 0, %loop.header ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} + +define i64 @invar_cond_incoming_ops_reordered(i1 %c) { +; CHECK-LABEL: define i64 @invar_cond_incoming_ops_reordered( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x 
i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP1]], <4 x i64> , <4 x i64> zeroinitializer +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 3 +; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[LOOP_LATCH]] +; CHECK: loop.latch: +; CHECK-NEXT: [[R:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ 1, [[THEN]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i32 [[IV]], 1000 +; CHECK-NEXT: br i1 [[ICMP]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[R_LCSSA:%.*]] = phi i64 [ [[R]], [[LOOP_LATCH]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[R_LCSSA]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c, label %loop.latch, label %then + +then: + br label %loop.latch + +loop.latch: + %r = phi i64 [ 0, %loop.header ], [ 1, %then ] + %iv.next = add i32 %iv, 1 + %icmp = icmp sgt i32 %iv, 1000 + br i1 %icmp, label %exit, label %loop.header + +exit: + %r.lcssa = phi i64 [ %r, %loop.latch ] + ret i64 %r.lcssa +} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. 
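blend-in-header.ll is new in this patch and, per its own comment, covers blend recipes that land in the header VPBB (https://github.com/llvm/llvm-project/issues/88297), in four variants: a constant condition, the same with reordered incoming operands, and a loop-invariant condition with and without reordering. As an illustrative sketch only (not from the patch), the scalar shape of the invar_cond test is roughly:

    // Illustrative C++ only, mirroring invar_cond above: the latch phi
    // blends the constants 0 and 1 under a loop-invariant condition, which
    // the vectorizer lowers to one select of splat vectors per iteration.
    #include <cstdint>

    int64_t invar_cond(bool c) {
      int64_t r = 0;
      for (int32_t iv = 0;; ++iv) {
        r = c ? 0 : 1; // phi [ 1, then ], [ 0, loop.header ] in the IR above
        if (iv > 1000)
          break; // 'icmp sgt i32 iv, 1000' in the test
      }
      return r;
    }

In the checks above, the invariant select is the only per-iteration vector work besides the induction update, and the middle block takes the result from the last lane with `extractelement ... i32 3`; the reordered variant instead `xor`s the splat condition rather than swapping the select operands back.
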
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll index c721da7597b1c..0b872709ec6c6 100644 --- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll @@ -19,20 +19,20 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[TMP8]] = select <4 x i1> [[TMP4]], <4 x i32> , <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 0 +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP7]], i32 1, i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: @@ -42,33 +42,32 @@ define i32 @any_of_reduction_epilog(ptr %src, i64 %N) { ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[BC_MERGE_RDX]], 0 ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] -; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i64 0 -; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; 
CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]] +; CHECK-NEXT: [[TMP17]] = select <4 x i1> [[TMP13]], <4 x i32> , <4 x i32> [[VEC_PHI6]] ; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <4 x i32> [[TMP17]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) -; CHECK-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]] -; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP17]], i32 1, i32 0 +; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP16]], i32 1, i32 0 ; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -102,104 +101,6 @@ exit: ret i32 %select } -define i32 @any_of_reduction_epilog_arg_as_start_value(ptr %src, i64 %N, i32 %start) { -; CHECK-LABEL: define i32 @any_of_reduction_epilog_arg_as_start_value( -; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i32 [[START:%.*]]) { -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr 
[[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP5]] = or <4 x i1> [[VEC_PHI]], [[TMP4]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]]) -; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP7]] -; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP8]], i32 1, i32 [[START]] -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[START]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP9:%.*]] = icmp ne i32 [[BC_MERGE_RDX]], [[START]] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 -; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] -; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP9]], i64 0 -; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP14:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX5]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD7]], zeroinitializer -; CHECK-NEXT: [[TMP14]] = or <4 x i1> [[VEC_PHI6]], [[TMP13]] -; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], 4 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP16:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) -; CHECK-NEXT: [[TMP17:%.*]] = freeze i1 [[TMP16]] -; CHECK-NEXT: [[RDX_SELECT9:%.*]] = select i1 [[TMP17]], i32 1, i32 [[START]] -; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N4]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[START]], [[ITER_CHECK]] ], [ [[START]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]] -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1 -; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[LOAD]], 0 -; CHECK-NEXT: [[SELECT]] = select i1 [[ICMP]], i32 1, i32 [[RED]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[ICMP3:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[ICMP3]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK: exit: -; CHECK-NEXT: [[SELECT_LCSSA:%.*]] = phi i32 [ [[SELECT]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT9]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[SELECT_LCSSA]] -; -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %red = phi i32 [ %start, %entry ], [ %select, %loop ] - %gep = getelementptr inbounds i8, ptr %src, i64 %iv - %load = load i8, ptr %gep, align 1 - %icmp = icmp eq i8 %load, 0 - %select = select i1 %icmp, i32 1, i32 %red - %iv.next = add i64 %iv, 1 - %icmp3 = icmp eq i64 %iv, %N - br i1 %icmp3, label %exit, label %loop - -exit: - ret i32 %select -} define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-LABEL: define i1 @any_of_reduction_i1_epilog( @@ -223,15 +124,14 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[TMP1]], -; CHECK-NEXT: [[TMP3]] = or <4 x i1> [[VEC_PHI]], [[TMP2]] +; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP1]], <4 x i1> [[VEC_PHI]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP3]]) -; CHECK-NEXT: [[TMP6:%.*]] = freeze i1 [[TMP5]] +; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <4 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP]]) ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i1 false, i1 false ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -244,11 +144,10 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ false, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
[[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i1 [[BC_MERGE_RDX]], false ; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP0]], 4 ; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF2]] ; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC3]] to i32 -; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP7]], i64 0 +; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[BC_MERGE_RDX]], i64 0 ; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x i1> [[MINMAX_IDENT_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer @@ -261,22 +160,21 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x i1> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND11:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i32> [[VEC_IND11]], [[BROADCAST_SPLAT14]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP8]], -; CHECK-NEXT: [[TMP10]] = or <4 x i1> [[VEC_PHI10]], [[TMP9]] +; CHECK-NEXT: [[TMP10]] = select <4 x i1> [[TMP8]], <4 x i1> [[VEC_PHI10]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT12]] = add <4 x i32> [[VEC_IND11]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP10]]) -; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; CHECK-NEXT: [[RDX_SELECT_CMP16:%.*]] = icmp ne <4 x i1> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_SELECT_CMP16]]) ; CHECK-NEXT: [[RDX_SELECT16:%.*]] = select i1 [[TMP13]], i1 false, i1 false ; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N8]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ false, [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ false, [[ITER_CHECK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -287,7 +185,7 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) { ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; 
CHECK-NEXT: [[IV_2_NEXT]] = add i32 [[IV_2]], 1 ; CHECK-NEXT: [[CMP_2:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_2]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i1 [ [[SEL]], [[LOOP]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ], [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i1 [[SEL_LCSSA]] @@ -321,7 +219,4 @@ exit: ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META1]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index caea114e3d448..bd658c31768a8 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -29,7 +29,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -84,7 +84,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -142,7 +142,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -193,7 +193,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; 
VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[FPINC]], [[DOTCAST]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub fast float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -276,7 +276,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -331,7 +331,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -389,7 +389,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -442,7 +442,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul reassoc float [[FPINC]], [[DOTCAST]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP1]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -526,7 +526,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 
2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -574,7 +574,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -625,7 +625,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -675,7 +675,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], [[INIT:%.*]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -758,10 +758,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL1-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC4_INTERL1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -835,10 +835,10 @@ define 
void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC4_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC4_INTERL2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i64 0 @@ -922,10 +922,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC1_INTERL2-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1000,10 +1000,10 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP2:%.*]] = fmul fast float [[DOTCAST]], -5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP2]], 0x3FB99999A0000000 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST2:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP3:%.*]] = fmul fast float [[TMP0]], [[DOTCAST2]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END3:%.*]] = fadd fast float [[TMP3]], [[INIT:%.*]] ; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[INIT]], i64 0 @@ -1113,7 +1113,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483644 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 
5.000000e-01 ; VEC4_INTERL1-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1158,7 +1158,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC4_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1206,7 +1206,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: ; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1256,7 +1256,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483646 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = fmul fast float [[DOTCAST]], 5.000000e-01 ; VEC2_INTERL1_PRED_STORE-NEXT: [[IND_END:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1319,7 +1319,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL1: vector.ph: ; VEC4_INTERL1-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775804 -; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL1-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL1: vector.body: ; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] @@ -1396,7 +1396,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC4_INTERL2: vector.ph: ; VEC4_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775800 -; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC4_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC4_INTERL2: vector.body: ; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ] @@ -1512,7 +1512,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; VEC1_INTERL2: vector.ph: 
; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 -; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC1_INTERL2: vector.body: ; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] @@ -1570,7 +1570,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.ph: ; VEC2_INTERL1_PRED_STORE-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], 9223372036854775806 -; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC2_INTERL1_PRED_STORE-NEXT: [[DOTCAST:%.*]] = uitofp nneg i64 [[N_VEC]] to float ; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC2_INTERL1_PRED_STORE: vector.body: ; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll index 6a9f83a9e0aa2..1b4bcf6a3739a 100644 --- a/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll +++ b/llvm/test/Transforms/LoopVectorize/select-cmp-predicated.ll @@ -1,114 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1 ; RUN: opt -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2 define i32 @pred_select_const_i32_from_icmp(ptr noalias nocapture readonly %src1, ptr noalias nocapture readonly %src2, i64 %n) { -; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp( +; CHECK-VF2IC1-LABEL: define i32 @pred_select_const_i32_from_icmp( +; CHECK-VF2IC1-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF2IC1-NEXT: entry: +; CHECK-VF2IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF2IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF2IC1: vector.ph: +; CHECK-VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF2IC1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-VF2IC1: vector.body: -; CHECK-VF2IC1: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ] -; CHECK-VF2IC1: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr {{%.*}}, align 4 -; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], -; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 -; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue +; CHECK-VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ] +; CHECK-VF2IC1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE2]] ] +; CHECK-VF2IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]] +; CHECK-VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], 
i32 0 +; CHECK-VF2IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 +; CHECK-VF2IC1-NEXT: [[TMP3:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], +; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0 +; CHECK-VF2IC1-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-VF2IC1: pred.load.if: -; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}} -; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0 -; CHECK-VF2IC1-NEXT: br label %pred.load.continue +; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]] +; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 +; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0 +; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-VF2IC1: pred.load.continue: -; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ] -; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 -; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2 +; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] +; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1 +; CHECK-VF2IC1-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]] ; CHECK-VF2IC1: pred.load.if1: -; CHECK-VF2IC1: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}} -; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 -; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1 -; CHECK-VF2IC1-NEXT: br label %pred.load.continue2 +; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF2IC1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP10]] +; CHECK-VF2IC1-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 +; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[TMP12]], i32 1 +; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE2]] ; CHECK-VF2IC1: pred.load.continue2: -; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ] -; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], -; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = or <2 x i1> [[VEC_PHI]], [[TMP16]] -; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP17]], <2 x i1> [[VEC_PHI]] -; CHECK-VF2IC1: br i1 {{%.*}}, label %middle.block, label %vector.body +; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] +; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = icmp eq <2 x i32> [[TMP14]], +; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> , <2 x i32> [[VEC_PHI]] +; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP3]], <2 x i32> [[TMP16]], <2 x i32> [[VEC_PHI]] +; CHECK-VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF2IC1-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF2IC1: middle.block: -; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[PREDPHI]]) -; CHECK-VF2IC1-NEXT: 
[[FR_TMP20:%.*]] = freeze i1 [[TMP20]] -; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP20]], i32 1, i32 0 +; CHECK-VF2IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer +; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]]) +; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0 +; CHECK-VF2IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF2IC1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF2IC1: scalar.ph: -; CHECK-VF2IC1: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ] -; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ] -; CHECK-VF2IC1-NEXT: br label %for.body +; CHECK-VF2IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF2IC1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF2IC1: for.body: -; CHECK-VF2IC1: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ] -; CHECK-VF2IC1: [[TMP21:%.*]] = load i32, ptr {{%.*}}, align 4 -; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35 -; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label %if.then, label %for.inc +; CHECK-VF2IC1-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-VF2IC1-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-VF2IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; CHECK-VF2IC1-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35 +; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK-VF2IC1: if.then: -; CHECK-VF2IC1: [[TMP22:%.*]] = load i32, ptr {{%.*}}, align 4 -; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2 +; CHECK-VF2IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2 ; CHECK-VF2IC1-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] -; CHECK-VF2IC1-NEXT: br label %for.inc +; CHECK-VF2IC1-NEXT: br label [[FOR_INC]] ; CHECK-VF2IC1: for.inc: -; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ] +; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-VF2IC1-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-VF2IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF2IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-VF2IC1: for.end.loopexit: -; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF2IC1-NEXT: ret i32 [[R_1_LCSSA]] ; -; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp( +; CHECK-VF1IC2-LABEL: define i32 @pred_select_const_i32_from_icmp( +; CHECK-VF1IC2-SAME: ptr noalias nocapture readonly [[SRC1:%.*]], ptr noalias nocapture readonly [[SRC2:%.*]], i64 [[N:%.*]]) { +; CHECK-VF1IC2-NEXT: 
entry: +; CHECK-VF1IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-VF1IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-VF1IC2: vector.ph: +; CHECK-VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-VF1IC2: vector.body: -; CHECK-VF1IC2: [[VEC_PHI:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue3 ] -; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue3 ] -; CHECK-VF1IC2: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC1:%.*]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4 -; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35 -; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35 -; CHECK-VF1IC2-NEXT: br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue +; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE3:%.*]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE3]] ] +; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI4:%.*]], [[PRED_LOAD_CONTINUE3]] ] +; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP3]], align 4 +; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP4]], 35 +; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = icmp sgt i32 [[TMP5]], 35 +; CHECK-VF1IC2-NEXT: br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK-VF1IC2: pred.load.if: -; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC2:%.*]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-VF1IC2-NEXT: br label %pred.load.continue +; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP0]] +; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE]] ; CHECK-VF1IC2: pred.load.continue: -; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ] -; CHECK-VF1IC2-NEXT: br i1 [[TMP5]], label %pred.load.if2, label %pred.load.continue3 +; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ] +; CHECK-VF1IC2-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3]] ; CHECK-VF1IC2: pred.load.if2: -; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 {{%.*}} -; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-VF1IC2-NEXT: br label %pred.load.continue3 +; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[TMP1]] +; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4 +; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE3]] ; CHECK-VF1IC2: 
pred.load.continue3: -; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if2 ] -; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2 -; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2 -; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = or i1 [[VEC_PHI]], [[TMP12]] -; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = or i1 [[VEC_PHI2]], [[TMP13]] -; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP4]], i1 [[TMP14]], i1 [[VEC_PHI]] -; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i1 [[TMP15]], i1 [[VEC_PHI2]] -; CHECK-VF1IC2: br i1 {{%.*}}, label %middle.block, label %vector.body +; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF2]] ] +; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP10]], 2 +; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = icmp eq i32 [[TMP13]], 2 +; CHECK-VF1IC2-NEXT: [[TMP16:%.*]] = select i1 [[TMP14]], i32 1, i32 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 1, i32 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP6]], i32 [[TMP16]], i32 [[VEC_PHI]] +; CHECK-VF1IC2-NEXT: [[PREDPHI4]] = select i1 [[TMP7]], i32 [[TMP17]], i32 [[VEC_PHI1]] +; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-VF1IC2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-VF1IC2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-VF1IC2: middle.block: -; CHECK-VF1IC2-NEXT: [[OR:%.*]] = or i1 [[PREDPHI5]], [[PREDPHI]] -; CHECK-VF1IC2-NEXT: [[FR_OR:%.*]] = freeze i1 [[OR]] -; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_OR]], i32 1, i32 0 -; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph +; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0 +; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI4]] +; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-VF1IC2: scalar.ph: -; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ] -; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ] -; CHECK-VF1IC2-NEXT: br label %for.body +; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] +; CHECK-VF1IC2-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-VF1IC2: for.body: -; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ] -; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ] -; CHECK-VF1IC2: [[TMP19:%.*]] = load i32, ptr {{%.*}}, align 4 +; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-VF1IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC1]], i64 [[I_013]] +; CHECK-VF1IC2-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-VF1IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35 -; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc +; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], 
label [[FOR_INC]] ; CHECK-VF1IC2: if.then: -; CHECK-VF1IC2: [[TMP20:%.*]] = load i32, ptr {{%.*}}, align 4 +; CHECK-VF1IC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[SRC2]], i64 [[I_013]] +; CHECK-VF1IC2-NEXT: [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-VF1IC2-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2 ; CHECK-VF1IC2-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]] -; CHECK-VF1IC2-NEXT: br label %for.inc +; CHECK-VF1IC2-NEXT: br label [[FOR_INC]] ; CHECK-VF1IC2: for.inc: -; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ] -; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %for.body +; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ] +; CHECK-VF1IC2-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 +; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] +; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-VF1IC2: for.end.loopexit: -; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ] +; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ] ; CHECK-VF1IC2-NEXT: ret i32 [[R_1_LCSSA]] ; entry: @@ -139,3 +180,14 @@ for.end.loopexit: ; preds = %for.inc %r.1.lcssa = phi i32 [ %r.1, %for.inc ] ret i32 %r.1.lcssa } +;. +; CHECK-VF2IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF2IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF2IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF2IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. +; CHECK-VF1IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-VF1IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-VF1IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-VF1IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/select-cmp.ll b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
index 993b56a05207b..c9f2aaef6d5c8 100644
--- a/llvm/test/Transforms/LoopVectorize/select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-cmp.ll
@@ -5,47 +5,45 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 ; CHECK-VF4IC4: vector.body:
-; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
 ; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <4 x i32> {{.*}}, <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC4-NEXT: [[NOT1:%.*]] = xor <4 x i1> [[VEC_ICMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[NOT2:%.*]] = xor <4 x i1> [[VEC_ICMP2]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[NOT3:%.*]] = xor <4 x i1> [[VEC_ICMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[NOT4:%.*]] = xor <4 x i1> [[VEC_ICMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = or <4 x i1> [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = or <4 x i1> [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = or <4 x i1> [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = or <4 x i1> [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF4IC4-NEXT: [[VEC_SEL1:%.*]] = select <4 x i1> [[VEC_ICMP1]], <4 x i32> [[VEC_PHI1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL2:%.*]] = select <4 x i1> [[VEC_ICMP2]], <4 x i32> [[VEC_PHI2]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL3:%.*]] = select <4 x i1> [[VEC_ICMP3]], <4 x i32> [[VEC_PHI3]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL4:%.*]] = select <4 x i1> [[VEC_ICMP4]], <4 x i32> [[VEC_PHI4]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
 ; CHECK-VF4IC4: middle.block:
-; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = or <4 x i1> [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = or <4 x i1> [[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = or <4 x i1> [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL7]])
-; CHECK-VF4IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne <4 x i32> [[VEC_SEL1]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select <4 x i1> [[VEC_ICMP5]], <4 x i32> [[VEC_SEL1]], <4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne <4 x i32> [[VEC_SEL5]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select <4 x i1> [[VEC_ICMP6]], <4 x i32> [[VEC_SEL5]], <4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne <4 x i32> [[VEC_SEL6]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[VEC_SEL_FIN:%.*]] = select <4 x i1> [[VEC_ICMP7]], <4 x i32> [[VEC_SEL6]], <4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL_FIN]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 ; CHECK-VF1IC4: vector.body:
-; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
-; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i1 [ false, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF1IC4: [[VEC_PHI1:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF1IC4-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 3, %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
 ; CHECK-VF1IC4: [[VEC_LOAD1:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD2:%.*]] = load i32
 ; CHECK-VF1IC4-NEXT: [[VEC_LOAD3:%.*]] = load i32
@@ -54,20 +52,17 @@ define i32 @select_const_i32_from_icmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq i32 [[VEC_LOAD2]], 3
 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq i32 [[VEC_LOAD3]], 3
 ; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq i32 [[VEC_LOAD4]], 3
-; CHECK-VF1IC4-NEXT: [[NOT1:%.*]] = xor i1 [[VEC_ICMP1]], true
-; CHECK-VF1IC4-NEXT: [[NOT2:%.*]] = xor i1 [[VEC_ICMP2]], true
-; CHECK-VF1IC4-NEXT: [[NOT3:%.*]] = xor i1 [[VEC_ICMP3]], true
-; CHECK-VF1IC4-NEXT: [[NOT4:%.*]] = xor i1 [[VEC_ICMP4]], true
-; CHECK-VF1IC4-NEXT: [[VEC_SEL1:%.*]] = or i1 [[VEC_PHI1]], [[NOT1]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL2:%.*]] = or i1 [[VEC_PHI2]], [[NOT2]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL3:%.*]] = or i1 [[VEC_PHI3]], [[NOT3]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL4:%.*]] = or i1 [[VEC_PHI4]], [[NOT4]]
+; CHECK-VF1IC4-NEXT: [[VEC_SEL1]] = select i1 [[VEC_ICMP1]], i32 [[VEC_PHI1]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL2]] = select i1 [[VEC_ICMP2]], i32 [[VEC_PHI2]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL3]] = select i1 [[VEC_ICMP3]], i32 [[VEC_PHI3]], i32 7
+; CHECK-VF1IC4-NEXT: [[VEC_SEL4]] = select i1 [[VEC_ICMP4]], i32 [[VEC_PHI4]], i32 7
 ; CHECK-VF1IC4: middle.block:
-; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = or i1 [[VEC_SEL2]], [[VEC_SEL1]]
-; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = or i1 [[VEC_SEL3]], [[VEC_SEL5]]
-; CHECK-VF1IC4-NEXT: [[OR_RDX:%.*]] = or i1 [[VEC_SEL4]], [[VEC_SEL6]]
-; CHECK-VF1IC4-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp ne i32 [[VEC_SEL1]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_SEL5:%.*]] = select i1 [[VEC_ICMP4]], i32 [[VEC_SEL1]], i32 [[VEC_SEL2]]
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne i32 [[VEC_SEL5]], 3
+; CHECK-VF1IC4-NEXT: [[VEC_SEL6:%.*]] = select i1 [[VEC_ICMP5]], i32 [[VEC_SEL5]], i32 [[VEC_SEL3]]
+; CHECK-VF1IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne i32 [[VEC_SEL6]], 3
+; CHECK-VF1IC4-NEXT: {{.*}} = select i1 [[VEC_ICMP6]], i32 [[VEC_SEL6]], i32 [[VEC_SEL4]]
 entry:
 br label %for.body
@@ -91,14 +86,14 @@ exit: ; preds = %for.body
 define i32 @select_const_i32_from_icmp2(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_icmp2
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 3, i32 3, i32 3, i32 3>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[VEC_ICMP]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32> [[VEC_PHI]]
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 7, i32 3
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
 entry:
 br label %for.body
@@ -122,18 +117,21 @@ exit: ; preds = %for.body
 define i32 @select_i32_from_icmp(ptr nocapture readonly %v, i32 %a, i32 %b, i64 %n) {
 ; CHECK-LABEL: @select_i32_from_icmp
 ; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
-; CHECK-VF4IC1-NOT: shufflevector <4 x i32>
+; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x i32>
 ; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
 entry:
 br label %for.body
@@ -156,15 +154,14 @@ exit: ; preds = %for.body
 define i32 @select_const_i32_from_fcmp_fast(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp_fast
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp fast ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 entry:
 br label %for.body
@@ -187,15 +184,14 @@ exit: ; preds = %for.body
 define i32 @select_const_i32_from_fcmp(ptr nocapture readonly %v, i64 %n) {
 ; CHECK-LABEL: @select_const_i32_from_fcmp
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 2, i32 2, i32 2>, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
 ; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <4 x float>
 ; CHECK-VF4IC1-NEXT: [[VEC_FCMP:%.*]] = fcmp ueq <4 x float> [[VEC_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_FCMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_FCMP]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 1, i32 2
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
 entry:
 br label %for.body
@@ -220,16 +216,18 @@ define i32 @select_i32_from_icmp_same_inputs(i32 %a, i32 %b, i64 %n) {
 ; CHECK-VF4IC1: vector.ph:
 ; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
 ; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-VF4IC1-NOT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 %b, i64 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i1> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
-; CHECK-VF4IC1: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[SPLAT_OF_A]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-VF4IC1-NEXT: [[NOT:%.*]] = xor <4 x i1> [[VEC_ICMP]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = or <4 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_ICMP:%.*]] = icmp eq <4 x i32> [[VEC_PHI]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <4 x i1> [[VEC_ICMP]], <4 x i32> [[VEC_PHI]], <4 x i32> [[SPLAT_OF_B]]
 ; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[VEC_SEL]])
-; CHECK-VF4IC1-NEXT: [[FR_OR_RDX:%.*]] = freeze i1 [[OR_RDX]]
-; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[FR_OR_RDX]], i32 %b, i32 %a
+; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <4 x i32> poison, i32 %a, i64 0
+; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <4 x i32> [[FIN_INS]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
 entry:
 br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
index 55e61158a79c6..16ab45415b5cc 100644
--- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
+++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll
@@ -8,25 +8,26 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ undef, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[NOT:%*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
-; CHECK-NEXT: [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 undef
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], undef
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 undef
 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
@@ -71,25 +72,26 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ poison, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
-; CHECK-NEXT: [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 poison
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], poison
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 poison
 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
@@ -134,25 +136,30 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
+; CHECK-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0
+; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <2 x i64> [[MINMAX_IDENT_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i1> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 1, [[INDEX]]
 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[TMP0]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]],
-; CHECK-NEXT: [[NOT:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP4]] = or <2 x i1> [[VEC_PHI]], [[NOT]]
+; CHECK-NEXT: [[TMP4]] = select <2 x i1> [[TMP3]], <2 x i64> [[VEC_PHI]], <2 x i64> [[BROADCAST_SPLAT]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
 ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[TMP4]])
-; CHECK-NEXT: [[FR_TMP6:%.*]] = freeze i1 [[TMP6]]
-; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[FR_TMP6]], i64 [[A]], i64 [[START]]
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[START]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i64> [[TMP4]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP6]], i64 [[A]], i64 [[START]]
 ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
 ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 33, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
index a90b38c6a9605..fe98ca167a089 100644
--- a/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
+++ b/llvm/test/Transforms/LoopVectorize/vectorize-force-tail-with-evl.ll
@@ -9,6 +9,11 @@
 ; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-vector-width=4 \
 ; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP %s
+; RUN: opt -passes=loop-vectorize \
+; RUN: -force-tail-folding-style=none \
+; RUN: -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue \
+; RUN: -force-target-supports-scalable-vectors -scalable-vectorization=on -S < %s | FileCheck --check-prefix=NO-VP-DEF %s
+
 ; The target does not support predicated vectorization.
 define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; IF-EVL-LABEL: @foo(
@@ -80,6 +85,54 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) {
 ; NO-VP: for.cond.cleanup:
 ; NO-VP-NEXT: ret void
 ;
+; NO-VP-DEF-LABEL: @foo(
+; NO-VP-DEF-NEXT: entry:
+; NO-VP-DEF-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-DEF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP0]]
+; NO-VP-DEF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; NO-VP-DEF: vector.ph:
+; NO-VP-DEF-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-DEF-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP1]]
+; NO-VP-DEF-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-VP-DEF-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; NO-VP-DEF-NEXT: br label [[VECTOR_BODY:%.*]]
+; NO-VP-DEF: vector.body:
+; NO-VP-DEF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; NO-VP-DEF-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; NO-VP-DEF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]]
+; NO-VP-DEF-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; NO-VP-DEF-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
+; NO-VP-DEF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 [[TMP3]]
+; NO-VP-DEF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
+; NO-VP-DEF-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NO-VP-DEF-NEXT: [[TMP8:%.*]] = add nsw <vscale x 4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; NO-VP-DEF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]]
+; NO-VP-DEF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
+; NO-VP-DEF-NEXT: store <vscale x 4 x i32> [[TMP8]], ptr [[TMP10]], align 4
+; NO-VP-DEF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; NO-VP-DEF-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-VP-DEF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-VP-DEF: middle.block:
+; NO-VP-DEF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-VP-DEF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; NO-VP-DEF: scalar.ph:
+; NO-VP-DEF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; NO-VP-DEF-NEXT: br label [[FOR_BODY:%.*]]
+; NO-VP-DEF: for.body:
+; NO-VP-DEF-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; NO-VP-DEF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; NO-VP-DEF-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; NO-VP-DEF-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[IV]]
+; NO-VP-DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; NO-VP-DEF-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
+; NO-VP-DEF-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
+; NO-VP-DEF-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4
+; NO-VP-DEF-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; NO-VP-DEF-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; NO-VP-DEF-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-VP-DEF: for.cond.cleanup:
+; NO-VP-DEF-NEXT: ret void
+;
 entry:
 br label %for.body
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
new file mode 100644
index 0000000000000..3afa1904fb249
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr88239.ll
@@ -0,0 +1,55 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes="default<O3>" -mcpu=skx -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(ptr noalias noundef %0, ptr noalias noundef %1) optsize {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr noalias nocapture noundef readonly [[TMP0:%.*]], ptr noalias nocapture noundef writeonly [[TMP1:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: vector.ph:
+; CHECK-NEXT: br label [[TMP4:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[TMP2:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[TMP4]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[TMP2]] ], [ [[VEC_IND_NEXT:%.*]], [[TMP4]] ]
+; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[VEC_IND]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[TMP6]], <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = tail call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP7]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> poison)
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER]], <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw i64 [[INDVARS_IV]], 8
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[TMP4]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: ret void
+;
+ br label %3
+
+3: ; preds = %7, %2
+ %4 = phi i32 [ 0, %2 ], [ %15, %7 ]
+ %5 = icmp slt i32 %4, 256
+ br i1 %5, label %7, label %6
+
+6: ; preds = %3
+ ret void
+
+7: ; preds = %3
+ %8 = sub nsw i32 255, %4
+ %9 = zext nneg i32 %8 to i64
+ %10 = getelementptr inbounds i32, ptr %0, i64 %9
+ %11 = load i32, ptr %10, align 4
+ %12 = add nsw i32 %11, 5
+ %13 = sext i32 %4 to i64
+ %14 = getelementptr inbounds i32, ptr %1, i64 %13
+ store i32 %12, ptr %14, align 4
+ %15 = add nsw i32 %4, 1
+ br label %3
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
diff --git a/llvm/test/Transforms/Reassociate/inverses.ll b/llvm/test/Transforms/Reassociate/inverses.ll index b6962c6872a9a..a9d0c4fb03222 100644 --- a/llvm/test/Transforms/Reassociate/inverses.ll +++ b/llvm/test/Transforms/Reassociate/inverses.ll @@ -12,12 +12,12 @@ define i32 @test1(i32 %a, i32 %b) { ret i32 %t5 } -define <2 x i32> @not_op_vec_undef(<2 x i32> %a, <2 x i32> %b) { -; CHECK-LABEL: @not_op_vec_undef( +define <2 x i32> @not_op_vec_poison(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @not_op_vec_poison( ; CHECK-NEXT: ret <2 x i32> zeroinitializer ; %t2 = and <2 x i32> %b, %a - %t4 = xor <2 x i32> %a, <i32 -1, i32 undef> + %t4 = xor <2 x i32> %a, <i32 -1, i32 poison> %t5 = and <2 x i32> %t2, %t4 ret <2 x i32> %t5 } diff --git a/llvm/test/Transforms/Reassociate/negation.ll b/llvm/test/Transforms/Reassociate/negation.ll index 4718d9d87ae1b..14ae86fb94aab 100644 --- a/llvm/test/Transforms/Reassociate/negation.ll +++ b/llvm/test/Transforms/Reassociate/negation.ll @@ -31,16 +31,16 @@ define i32 @test2(i32 %a, i32 %b, i32 %z) { ret i32 %f } -define <2 x i32> @negate_vec_undefs(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) { -; CHECK-LABEL: @negate_vec_undefs( +define <2 x i32> @negate_vec_poisons(<2 x i32> %a, <2 x i32> %b, <2 x i32> %z) { +; CHECK-LABEL: @negate_vec_poisons( ; CHECK-NEXT: [[E:%.*]] = mul <2 x i32> [[A:%.*]], ; CHECK-NEXT: [[F:%.*]] = mul <2 x i32> [[E]], [[Z:%.*]] ; CHECK-NEXT: ret <2 x i32> [[F]] ; %d = mul <2 x i32> %z, - %c = sub <2 x i32> <i32 0, i32 undef>, %d + %c = sub <2 x i32> <i32 0, i32 poison>, %d %e = mul <2 x i32> %a, %c - %f = sub <2 x i32> <i32 0, i32 undef>, %e + %f = sub <2 x i32> <i32 0, i32 poison>, %e ret <2 x i32> %f } diff --git a/llvm/test/Transforms/SCCP/pr50901.ll b/llvm/test/Transforms/SCCP/pr50901.ll index 11d6bba6f6a93..d48d67532d88b 100644 --- a/llvm/test/Transforms/SCCP/pr50901.ll +++ b/llvm/test/Transforms/SCCP/pr50901.ll @@ -52,6 +52,16 @@ ; CHECK: = !DIGlobalVariableExpression(var: ![[DBG_FLOAT_UNDEF:.+]], expr: !DIExpression()) ; CHECK-DAG: ![[DBG_FLOAT_UNDEF]] = distinct !DIGlobalVariable(name: "g_float_undef" +; CHECK: ![[G8:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG8:[0-9]+]], expr: !DIExpression(DW_OP_constu, 22136, DW_OP_stack_value)) +; CHECK-DAG: ![[DBG8]] = distinct !DIGlobalVariable(name: "g_88", {{.*}} +; CHECK: ![[G9:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBG9:[0-9]+]], expr: !DIExpression(DW_OP_constu, 23726, DW_OP_stack_value)) +; CHECK-DAG: ![[DBG9]] = distinct !DIGlobalVariable(name: "g_99", {{.*}} + +; CHECK-DAG: ![[DBGA:[0-9]+]] = distinct !DIGlobalVariable(name: "g_i32_undef" +; CHECK-DAG: ![[GA:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBGA]], expr: !DIExpression()) +; CHECK-DAG: ![[DBGB:[0-9]+]] = distinct !DIGlobalVariable(name: "g_ptr_undef" +; CHECK-DAG: ![[GB:[0-9]+]] = !DIGlobalVariableExpression(var: ![[DBGB]], expr: !DIExpression()) + @g_1 = dso_local global i32 -4, align 4, !dbg !0 @g_2 = dso_local global float 0x4011C28F60000000, align 4, !dbg !8 @g_3 = dso_local global i8 97, align 1, !dbg !10 @@ -59,6 +69,8 @@ @g_5 = dso_local global i8 1, align 1, !dbg !16 @g_6 = dso_local global ptr null, align 8, !dbg !19 @g_7 = dso_local global ptr null, align 8, !dbg !23 +@g_8 = dso_local global half 0xH4321, align 4, !dbg !86 +@g_9 = dso_local global bfloat 0xR3F80, align 4, !dbg !90 @_ZL4g_11 = internal global i32 -5, align 4, !dbg !25 @_ZL4g_22 = internal global float 0x4016333340000000, align 4, !dbg !27 @_ZL4g_33 = internal global i8 98, align 1, !dbg !29 @@ -67,6 +79,10 @@ @_ZL4g_66 = internal global ptr null, align 8, !dbg !35 @_ZL4g_77 = internal global ptr inttoptr (i64 70 to ptr), align 8, !dbg !37
@g_float_undef = internal global float undef, align 4, !dbg !83 +@_ZL4g_88 = internal global half 0xH5678, align 4, !dbg !88 +@_ZL4g_99 = internal global bfloat 0xR5CAE, align 4, !dbg !92 +@g_i32_undef = internal global i32 undef, align 4, !dbg !95 +@g_ptr_undef = internal global ptr undef, align 8, !dbg !97 define dso_local void @_Z3barv() !dbg !46 { entry: @@ -88,6 +104,15 @@ entry: store ptr %6, ptr @g_7, align 8, !dbg !59 %l = load float, ptr @g_float_undef, align 8, !dbg !59 store float %l, ptr @g_2, align 8, !dbg !59 + %7 = load half, ptr @_ZL4g_88, align 4, !dbg !59 + store half %7, ptr @g_8, align 4, !dbg !59 + %8 = load bfloat, ptr @_ZL4g_99, align 4, !dbg !59 + store bfloat %8, ptr @g_9, align 4, !dbg !59 + %9 = load i32, ptr @g_i32_undef, align 4, !dbg !59 + store i32 %9, ptr @g_1, align 4, !dbg !59 + %10 = load ptr, ptr @g_ptr_undef, align 8, !dbg !59 + store ptr %10, ptr @g_6, align 8, !dbg !59 + ret void, !dbg !59 } @@ -108,7 +133,7 @@ entry: !4 = !{!5} !5 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64) !6 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) -!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83} +!7 = !{!0, !8, !10, !13, !16, !19, !23, !25, !27, !29, !31, !33, !35, !37, !83, !86, !88, !90, !92, !95, !97} !8 = !DIGlobalVariableExpression(var: !9, expr: !DIExpression()) !9 = distinct !DIGlobalVariable(name: "g_2", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true) !10 = !DIGlobalVariableExpression(var: !11, expr: !DIExpression()) @@ -159,3 +184,17 @@ entry: !82 = !DILocation(line: 31, column: 1, scope: !77) !83 = !DIGlobalVariableExpression(var: !84, expr: !DIExpression()) !84 = distinct !DIGlobalVariable(name: "g_float_undef", linkageName: "g_float_undef", scope: !2, file: !3, line: 15, type: !6, isLocal: true, isDefinition: true) +!85 = !DIBasicType(name: "float", size: 16, encoding: DW_ATE_float) +!86 = !DIGlobalVariableExpression(var: !87, expr: !DIExpression()) +!87 = distinct !DIGlobalVariable(name: "g_8", scope: !2, file: !3, line: 2, type: !85, isLocal: false, isDefinition: true) +!88 = !DIGlobalVariableExpression(var: !89, expr: !DIExpression()) +!89 = distinct !DIGlobalVariable(name: "g_88", linkageName: "_ZL4g_88", scope: !2, file: !3, line: 10, type: !85, isLocal: true, isDefinition: true) +!90 = !DIGlobalVariableExpression(var: !91, expr: !DIExpression()) +!91 = distinct !DIGlobalVariable(name: "g_9", scope: !2, file: !3, line: 2, type: !85, isLocal: false, isDefinition: true) +!92 = !DIGlobalVariableExpression(var: !93, expr: !DIExpression()) +!93 = distinct !DIGlobalVariable(name: "g_99", linkageName: "_ZL4g_99", scope: !2, file: !3, line: 10, type: !85, isLocal: true, isDefinition: true) + +!95 = !DIGlobalVariableExpression(var: !96, expr: !DIExpression()) +!96 = distinct !DIGlobalVariable(name: "g_i32_undef", linkageName: "g_i32_undef", scope: !2, file: !3, line: 9, type: !22, isLocal: true, isDefinition: true) +!97 = !DIGlobalVariableExpression(var: !98, expr: !DIExpression()) +!98 = distinct !DIGlobalVariable(name: "g_ptr_undef", linkageName: "g_ptr_undef", scope: !2, file: !3, line: 14, type: !21, isLocal: true, isDefinition: true) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll new file mode 100644 index 0000000000000..5db148ac1b485 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smax-unsigned-operand.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +@e = global [2 x i8] zeroinitializer + +define void @main(ptr noalias %p) { +; CHECK-LABEL: define void @main( +; CHECK-SAME: ptr noalias [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: bb: +; CHECK-NEXT: [[CONV_4:%.*]] = zext i32 0 to i64 +; CHECK-NEXT: [[COND_4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[CONV_4]], i64 0) +; CHECK-NEXT: [[CONV5_4:%.*]] = trunc i64 [[COND_4]] to i8 +; CHECK-NEXT: store i8 [[CONV5_4]], ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 4), align 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[COND_5:%.*]] = tail call i64 @llvm.smax.i64(i64 [[CONV_5]], i64 1) +; CHECK-NEXT: [[CONV5_5:%.*]] = trunc i64 [[COND_5]] to i8 +; CHECK-NEXT: store i8 [[CONV5_5]], ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 5), align 1 +; CHECK-NEXT: ret void +; +bb: + %conv.4 = zext i32 0 to i64 + %cond.4 = tail call i64 @llvm.smax.i64(i64 %conv.4, i64 0) + %conv5.4 = trunc i64 %cond.4 to i8 + store i8 %conv5.4, ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 4), align 1 + %0 = load i32, ptr %p, align 4 + %conv.5 = zext i32 %0 to i64 + %cond.5 = tail call i64 @llvm.smax.i64(i64 %conv.5, i64 1) + %conv5.5 = trunc i64 %cond.5 to i8 + store i8 %conv5.5, ptr getelementptr inbounds ([11 x i8], ptr @e, i64 0, i64 5), align 1 + ret void +} + diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll index 639aa0a1c6a2c..9b8480cd0088a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll @@ -834,22 +834,34 @@ define void @fshl_v2i32_uniformconst() { ; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshl_v2i32_uniformconst( -; AVX-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshl_v2i32_uniformconst( +; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshl_v2i32_uniformconst( +; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX2-NEXT: [[R1:%.*]] = 
call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshl_v2i32_uniformconst( +; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshl_v2i32_uniformconst( -; AVX512-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX512-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX512-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX512-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX512-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX512-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 ; AVX512-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll index c557c9647551a..f3e73d0e6840e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll @@ -834,22 +834,34 @@ define void @fshr_v2i32_uniformconst() { ; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshr_v2i32_uniformconst( -; AVX-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshr_v2i32_uniformconst( +; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshr_v2i32_uniformconst( +; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) +; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) +; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 +; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; 
AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshr_v2i32_uniformconst( +; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshr_v2i32_uniformconst( -; AVX512-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX512-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX512-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX512-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX512-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX512-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> ) +; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 ; AVX512-NEXT: ret void ; %a0 = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 0 ), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll index 096f57d100a50..c600d75ed1e8c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract_in_tree_user.ll @@ -13,7 +13,7 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef @@ -92,7 +92,7 @@ define void @externally_used_ptrs() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x ptr> [[TMP1]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 11 ; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i64> [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index aa67974358306..e459cd8c6955b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -13,25 +13,26 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK: while.body: ; CHECK-NEXT: [[C_022:%.*]] = phi ptr [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = 
ptrtoint ptr [[C_022]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP9]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ -; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] -; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] +; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] +; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x ptr> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 +; CHECK-NEXT: store i32 [[TMP7]], ptr [[INCDEC_PTR1]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, ptr [[C_022]], i64 2 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 1 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x ptr> [[TMP1]], <2 x i64> @@ -39,7 +40,7 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: store i32 [[TMP11]], ptr [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] +; CHECK-NEXT: [[C_022_BE]] = phi ptr [ [[INCDEC_PTR1]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] ; CHECK-NEXT: [[TMP14]] = phi <2 x ptr> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll index 3801fa5c787b6..c40be9690cce1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -52,17 +52,14 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 { define void @test2(ptr %a, ptr %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 1 -; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 -; CHECK-NEXT: [[I1:%.*]] = ptrtoint ptr [[A1]] to i64 -; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 3 -; CHECK-NEXT: [[I2:%.*]] = ptrtoint ptr [[B3]] to i64 -; CHECK-NEXT: [[V1:%.*]] = load i64, ptr [[A1]], align 8 -; CHECK-NEXT: [[V2:%.*]] = load i64, ptr [[A2]], align 8 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[I1]], [[V1]] -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[I2]], [[V2]] -; CHECK-NEXT: store i64 [[ADD1]], ptr [[A1]], align 8 -; CHECK-NEXT: store i64 [[ADD2]], ptr [[A2]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = 
insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i64>, ptr [[A1]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr [[A1]], align 8 ; CHECK-NEXT: ret void ; %a1 = getelementptr inbounds i64, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll index 75505f632a43f..3deab0975ce76 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll @@ -100,41 +100,17 @@ define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) { define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) { ; SSE-LABEL: @store_i64( ; SSE-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64 -; SSE-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] -; SSE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP4]] -; SSE-NEXT: [[TMP7:%.*]] = lshr i64 [[TMP6]], 15 -; SSE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; SSE-NEXT: [[TMP9:%.*]] = icmp ult i32 [[TMP8]], 255 -; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295 -; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255 -; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 -; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]] -; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15 -; SSE-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32 -; SSE-NEXT: [[TMP17:%.*]] = icmp ult i32 [[TMP16]], 255 -; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295 -; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255 -; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 -; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]] -; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15 -; SSE-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 -; SSE-NEXT: [[TMP25:%.*]] = icmp ult i32 [[TMP24]], 255 -; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295 -; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255 -; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24 -; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]] -; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]] -; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15 -; SSE-NEXT: [[TMP32:%.*]] = trunc i64 [[TMP31]] to i32 -; SSE-NEXT: [[TMP33:%.*]] = icmp ult i32 [[TMP32]], 255 -; SSE-NEXT: [[TMP34:%.*]] = and i64 [[TMP31]], 4294967295 -; SSE-NEXT: [[TMP35:%.*]] = select i1 [[TMP33]], i64 [[TMP34]], i64 255 -; SSE-NEXT: store i64 [[TMP35]], ptr [[TMP28]], align 8, !tbaa [[TBAA5]] +; SSE-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]] +; SSE-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0 +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> 
zeroinitializer +; SSE-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]] +; SSE-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]], +; SSE-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; SSE-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]], +; SSE-NEXT: [[TMP12:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32> +; SSE-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> +; SSE-NEXT: [[TMP14:%.*]] = zext <4 x i32> [[TMP13]] to <4 x i64> +; SSE-NEXT: store <4 x i64> [[TMP14]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @store_i64( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index ddc2a1b819041..30f328293cdaa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -9,7 +9,7 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x ptr addrspace(1)> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 ; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison) ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll index 0125e5fab089b..e93c5244dfbe2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stacksave-dependence.ll @@ -35,7 +35,7 @@ define void @allocas(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8 ; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret void @@ -127,7 +127,7 @@ define void @stacksave2(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[V1]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[V2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <2 x ptr> [[TMP2]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[V1]], i32 1 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[A:%.*]], align 8 ; CHECK-NEXT: call void @use(ptr inalloca(i8) [[V2]]) #[[ATTR5:[0-9]+]] ; CHECK-NEXT: call void @llvm.stackrestore.p0(ptr [[STACK]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll index 81b4ee40e7fdf..2f0fad70b593b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/trunc-store-value-ty-not-power-of-2.ll @@ -107,3 +107,36 @@ define void @test_4_trunc_i24_to_i16(i24 %x, ptr %A) { store i16 %t, ptr %gep.3, align 1 ret void } + +%struct.d = type { [3 x i8], [3 x i8], [2 x i8] } + +; Test case for https://github.com/llvm/llvm-project/issues/88640. +define void @test_access_i24_directly(ptr %src, ptr noalias %dst) "target-cpu"="btver2" { +; CHECK-LABEL: define void @test_access_i24_directly( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[SRC]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i24 +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds [[STRUCT_D:%.*]], ptr [[SRC]], i64 0, i32 1 +; CHECK-NEXT: [[BF_LOAD:%.*]] = load i24, ptr [[GEP_SRC]], align 1 +; CHECK-NEXT: [[BF_VALUE:%.*]] = and i24 [[TMP1]], 8388607 +; CHECK-NEXT: [[BF_CLEAR:%.*]] = and i24 [[BF_LOAD]], -8388608 +; CHECK-NEXT: [[BF_SET:%.*]] = or disjoint i24 [[BF_CLEAR]], [[BF_VALUE]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds [[STRUCT_D]], ptr [[DST]], i64 0, i32 1 +; CHECK-NEXT: store i24 [[BF_SET]], ptr [[GEP_DST]], align 1 +; CHECK-NEXT: store i24 0, ptr [[DST]], align 8 +; CHECK-NEXT: ret void +; +entry: + %0 = load i64, ptr %src, align 8 + %1 = trunc i64 %0 to i24 + %gep.src = getelementptr inbounds %struct.d, ptr %src, i64 0, i32 1 + %bf.load = load i24, ptr %gep.src, align 1 + %bf.value = and i24 %1, 8388607 + %bf.clear = and i24 %bf.load, -8388608 + %bf.set = or disjoint i24 %bf.clear, %bf.value + %gep.dst = getelementptr inbounds %struct.d, ptr %dst, i64 0, i32 1 + store i24 %bf.set, ptr %gep.dst, align 1 + store i24 0, ptr %dst, align 8 + ret void +} diff --git a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll index eae0879004839..b27c026dbccf0 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/select-shuffle.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='vector-combine' -S %s | FileCheck %s +; RUN: opt -passes=vector-combine -S %s | FileCheck %s target triple = "aarch64" diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll index d96dfec849167..a1d4ca1a740e7 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='vector-combine' -S %s | FileCheck %s +; RUN: opt -passes=vector-combine -S %s | FileCheck %s target triple = "aarch64" diff --git a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll index d69cb75664a8c..a22575ccb1ca2 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes='vector-combine' -S %s | FileCheck %s +; RUN: opt 
-passes=vector-combine -S %s | FileCheck %s target triple = "aarch64" diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll new file mode 100644 index 0000000000000..e2ff343944cf2 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-binops.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX + +declare void @use(<4 x i32>) + +; Shuffle is much cheaper than fdiv. FMF are intersected. + +define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; CHECK-LABEL: define <4 x float> @shuf_fdiv_v4f32_yy( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP2]], [[TMP3]] +; CHECK-NEXT: ret <4 x float> [[R]] +; + %b0 = fdiv fast <4 x float> %x, %y + %b1 = fdiv arcp <4 x float> %z, %y + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> + ret <4 x float> %r +} + +; Common operand is op0 of the binops. + +define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_add_v4i32_xx( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> +; CHECK-NEXT: [[R2:%.*]] = add <4 x i32> [[TMP1]], [[R1]] +; CHECK-NEXT: ret <4 x i32> [[R2]] +; + %b0 = add <4 x i32> %x, %y + %b1 = add <4 x i32> %x, %z + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; For commutative instructions, common operand may be swapped. + +define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; CHECK-LABEL: define <4 x float> @shuf_fmul_v4f32_xx_swap( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x float> [[R]] +; + %b0 = fmul <4 x float> %x, %y + %b1 = fmul <4 x float> %z, %x + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> + ret <4 x float> %r +} + +; For commutative instructions, common operand may be swapped. 
+ +define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { +; CHECK-LABEL: define <2 x i64> @shuf_and_v2i64_yy_swap( +; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y]], <2 x i64> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X]], <2 x i64> [[Z]], <2 x i32> +; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %b0 = and <2 x i64> %x, %y + %b1 = and <2 x i64> %y, %z + %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> + ret <2 x i64> %r +} + +; non-commutative binop, but common op0 + +define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %b0 = shl <4 x i32> %x, %y + %b1 = shl <4 x i32> %x, %z + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - common operand, but not commutable + +define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_shl_v4i32_xx_swap( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z]], [[X]] +; CHECK-NEXT: [[R1:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R1]] +; + %b0 = shl <4 x i32> %x, %y + %b1 = shl <4 x i32> %z, %x + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - mismatched opcodes + +define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { +; CHECK-LABEL: define <2 x i64> @shuf_sub_add_v2i64_yy( +; CHECK-SAME: <2 x i64> [[X:%.*]], <2 x i64> [[Y:%.*]], <2 x i64> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = add <2 x i64> [[Z]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> +; CHECK-NEXT: ret <2 x i64> [[R]] +; + %b0 = sub <2 x i64> %x, %y + %b1 = add <2 x i64> %z, %y + %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> + ret <2 x i64> %r +} + +; negative test - type change via shuffle + +define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) { +; CHECK-LABEL: define <8 x float> @shuf_fmul_v4f32_xx_type( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z]], [[X]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> +; CHECK-NEXT: ret <8 x float> [[R]] +; + %b0 = fmul <4 x float> %x, %y + %b1 = fmul <4 x float> %z, %x + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <8 x i32> + ret <8 x float> %r +} + +; negative test - uses + +define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_lshr_v4i32_yy_use1( +; CHECK-SAME: <4 x 
i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: call void @use(<4 x i32> [[B0]]) +; CHECK-NEXT: [[B1:%.*]] = lshr <4 x i32> [[Z]], [[Y]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %b0 = lshr <4 x i32> %x, %y + call void @use(<4 x i32> %b0) + %b1 = lshr <4 x i32> %z, %y + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - uses + +define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { +; CHECK-LABEL: define <4 x i32> @shuf_mul_v4i32_yy_use2( +; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = mul <4 x i32> [[Z]], [[Y]] +; CHECK-NEXT: call void @use(<4 x i32> [[B1]]) +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %b0 = mul <4 x i32> %x, %y + %b1 = mul <4 x i32> %z, %y + call void @use(<4 x i32> %b1) + %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> + ret <4 x i32> %r +} + +; negative test - must have matching operand + +define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { +; CHECK-LABEL: define <4 x float> @shuf_fadd_v4f32_no_common_op( +; CHECK-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x float> [[W:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z]], [[W]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> +; CHECK-NEXT: ret <4 x float> [[R]] +; + %b0 = fadd <4 x float> %x, %y + %b1 = fadd <4 x float> %z, %w + %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> + ret <4 x float> %r +} + +; negative test - binops may be relatively cheap + +define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) { +; CHECK-LABEL: define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf( +; CHECK-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X]], [[Y]] +; CHECK-NEXT: [[B1:%.*]] = and <16 x i16> [[Y]], [[Z]] +; CHECK-NEXT: [[R:%.*]] = shufflevector <16 x i16> [[B0]], <16 x i16> [[B1]], <16 x i32> +; CHECK-NEXT: ret <16 x i16> [[R]] +; + %b0 = and <16 x i16> %x, %y + %b1 = and <16 x i16> %y, %z + %r = shufflevector <16 x i16> %b0, <16 x i16> %b1, <16 x i32> + ret <16 x i16> %r +} +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll index 7d9f7e390b9c0..3a5d2095e2b93 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-casts.ll @@ -1,12 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=CHECK,AVX ; standard vector concatenations define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_zext_v8i16_v16i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-LABEL: define <16 x i32> @concat_zext_v8i16_v16i32( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = zext <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; @@ -17,8 +18,9 @@ define <16 x i32> @concat_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { } define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_zext_nneg_v8i16_v16i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-LABEL: define <16 x i32> @concat_zext_nneg_v8i16_v16i32( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = zext nneg <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; @@ -29,14 +31,16 @@ define <16 x i32> @concat_zext_nneg_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { } define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: @concat_sext_zext_nneg_v8i16_v8i32( -; SSE-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> -; SSE-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1:%.*]] to <8 x i32> +; SSE-LABEL: define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32( +; SSE-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0]] to <8 x i32> +; SSE-NEXT: [[X1:%.*]] = zext nneg <8 x i16> [[A1]] to <8 x i32> ; SSE-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> ; SSE-NEXT: ret <16 x i32> [[R]] ; -; AVX-LABEL: @concat_sext_zext_nneg_v8i16_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; AVX-LABEL: define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32( +; AVX-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; AVX-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32> ; AVX-NEXT: ret <16 x i32> [[R]] ; @@ -47,8 +51,9 @@ define <16 x i32> @concat_sext_zext_nneg_v8i16_v8i32(<8 x i16> %a0, <8 x i16> %a } define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_sext_v8i16_v16i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x 
i16> [[A1:%.*]], <16 x i32> +; CHECK-LABEL: define <16 x i32> @concat_sext_v8i16_v16i32( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; @@ -59,8 +64,9 @@ define <16 x i32> @concat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { } define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) { -; CHECK-LABEL: @concat_sext_v4i1_v8i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <8 x i32> @concat_sext_v4i1_v8i32( +; CHECK-SAME: <4 x i1> [[A0:%.*]], <4 x i1> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[A0]], <4 x i1> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = sext <8 x i1> [[TMP1]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; @@ -71,8 +77,9 @@ define <8 x i32> @concat_sext_v4i1_v8i32(<4 x i1> %a0, <4 x i1> %a1) { } define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @concat_trunc_v4i32_v8i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <8 x i16> @concat_trunc_v4i32_v8i16( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = trunc <8 x i32> [[TMP1]] to <8 x i16> ; CHECK-NEXT: ret <8 x i16> [[R]] ; @@ -83,8 +90,9 @@ define <8 x i16> @concat_trunc_v4i32_v8i16(<4 x i32> %a0, <4 x i32> %a1) { } define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @concat_inttoptr_v4i32_v8iptr( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <8 x ptr> @concat_inttoptr_v4i32_v8iptr( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = inttoptr <8 x i32> [[TMP1]] to <8 x ptr> ; CHECK-NEXT: ret <8 x ptr> [[R]] ; @@ -95,8 +103,9 @@ define <8 x ptr> @concat_inttoptr_v4i32_v8iptr(<4 x i32> %a0, <4 x i32> %a1) { } define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) { -; CHECK-LABEL: @concat_ptrtoint_v8i16_v16i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]], <16 x i32> +; CHECK-LABEL: define <16 x i64> @concat_ptrtoint_v8i16_v16i32( +; CHECK-SAME: <8 x ptr> [[A0:%.*]], <8 x ptr> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[A0]], <8 x ptr> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = ptrtoint <16 x ptr> [[TMP1]] to <16 x i64> ; CHECK-NEXT: ret <16 x i64> [[R]] ; @@ -107,14 +116,16 @@ define <16 x i64> @concat_ptrtoint_v8i16_v16i32(<8 x ptr> %a0, <8 x ptr> %a1) { } define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) { -; SSE-LABEL: @concat_fpext_v4f32_v8f64( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> +; SSE-LABEL: define <8 x double> @concat_fpext_v4f32_v8f64( +; SSE-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> ; SSE-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double> ; SSE-NEXT: ret <8 x 
double> [[R]] ; -; AVX-LABEL: @concat_fpext_v4f32_v8f64( -; AVX-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0:%.*]] to <4 x double> -; AVX-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1:%.*]] to <4 x double> +; AVX-LABEL: define <8 x double> @concat_fpext_v4f32_v8f64( +; AVX-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[X0:%.*]] = fpext <4 x float> [[A0]] to <4 x double> +; AVX-NEXT: [[X1:%.*]] = fpext <4 x float> [[A1]] to <4 x double> ; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X1]], <8 x i32> ; AVX-NEXT: ret <8 x double> [[R]] ; @@ -125,9 +136,10 @@ define <8 x double> @concat_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) } define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> %a1) { -; CHECK-LABEL: @concat_fptrunc_v8f64_v16f32( -; CHECK-NEXT: [[X0:%.*]] = fptrunc <8 x double> [[A0:%.*]] to <8 x float> -; CHECK-NEXT: [[X1:%.*]] = fptrunc <8 x double> [[A1:%.*]] to <8 x float> +; CHECK-LABEL: define <16 x float> @concat_fptrunc_v8f64_v16f32( +; CHECK-SAME: <8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = fptrunc <8 x double> [[A0]] to <8 x float> +; CHECK-NEXT: [[X1:%.*]] = fptrunc <8 x double> [[A1]] to <8 x float> ; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[X0]], <8 x float> [[X1]], <16 x i32> ; CHECK-NEXT: ret <16 x float> [[R]] ; @@ -140,8 +152,9 @@ define <16 x float> @concat_fptrunc_v8f64_v16f32(<8 x double> %a0, <8 x double> ; commuted vector concatenation define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @rconcat_sext_v8i16_v16i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-LABEL: define <16 x i32> @rconcat_sext_v8i16_v16i32( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = sext <16 x i16> [[TMP1]] to <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; @@ -154,8 +167,9 @@ define <16 x i32> @rconcat_sext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { ; interleaved shuffle define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> %a1) { -; CHECK-LABEL: @interleave_fpext_v4f32_v8f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <8 x double> @interleave_fpext_v4f32_v8f64( +; CHECK-SAME: <4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A0]], <4 x float> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = fpext <8 x float> [[TMP1]] to <8 x double> ; CHECK-NEXT: ret <8 x double> [[R]] ; @@ -168,8 +182,9 @@ define <8 x double> @interleave_fpext_v4f32_v8f64(<4 x float> %a0, <4 x float> % ; bitcasts (same element count) define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @concat_bitcast_v4i32_v8f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <8 x float> @concat_bitcast_v4i32_v8f32( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float> ; CHECK-NEXT: ret <8 x float> [[R]] ; @@ -182,8 +197,9 @@ define <8 x float> @concat_bitcast_v4i32_v8f32(<4 x i32> %a0, <4 x i32> 
%a1) { ; bitcasts (lower element count) define <4 x double> @concat_bitcast_v8i16_v4f64(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_bitcast_v8i16_v4f64( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]], <16 x i32> +; CHECK-LABEL: define <4 x double> @concat_bitcast_v8i16_v4f64( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> [[A1]], <16 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <16 x i16> [[TMP1]] to <4 x double> ; CHECK-NEXT: ret <4 x double> [[R]] ; @@ -196,8 +212,9 @@ define <4 x double> @concat_bitcast_v8i16_v4f64(<8 x i16> %a0, <8 x i16> %a1) { ; bitcasts (higher element count) define <16 x i16> @concat_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @concat_bitcast_v4i32_v16i16( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], <8 x i32> +; CHECK-LABEL: define <16 x i16> @concat_bitcast_v4i32_v16i16( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[A1]], <8 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <8 x i32> [[TMP1]] to <16 x i16> ; CHECK-NEXT: ret <16 x i16> [[R]] ; @@ -210,11 +227,12 @@ define <16 x i16> @concat_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { ; negative - multiuse define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1, ptr %a2) { -; CHECK-LABEL: @concat_trunc_v4i32_v8i16_multiuse( -; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0:%.*]] to <4 x i16> -; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1:%.*]] to <4 x i16> +; CHECK-LABEL: define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]], ptr [[A2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = trunc <4 x i32> [[A0]] to <4 x i16> +; CHECK-NEXT: [[X1:%.*]] = trunc <4 x i32> [[A1]] to <4 x i16> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[X0]], <4 x i16> [[X1]], <8 x i32> -; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2:%.*]], align 8 +; CHECK-NEXT: store <4 x i16> [[X0]], ptr [[A2]], align 8 ; CHECK-NEXT: ret <8 x i16> [[R]] ; %x0 = trunc <4 x i32> %a0 to <4 x i16> @@ -227,9 +245,10 @@ define <8 x i16> @concat_trunc_v4i32_v8i16_multiuse(<4 x i32> %a0, <4 x i32> %a1 ; negative - bitcasts (unscalable higher element count) define <16 x i16> @revpair_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { -; CHECK-LABEL: @revpair_bitcast_v4i32_v16i16( -; CHECK-NEXT: [[X0:%.*]] = bitcast <4 x i32> [[A0:%.*]] to <8 x i16> -; CHECK-NEXT: [[X1:%.*]] = bitcast <4 x i32> [[A1:%.*]] to <8 x i16> +; CHECK-LABEL: define <16 x i16> @revpair_bitcast_v4i32_v16i16( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = bitcast <4 x i32> [[A0]] to <8 x i16> +; CHECK-NEXT: [[X1:%.*]] = bitcast <4 x i32> [[A1]] to <8 x i16> ; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[X0]], <8 x i16> [[X1]], <16 x i32> ; CHECK-NEXT: ret <16 x i16> [[R]] ; @@ -242,9 +261,10 @@ define <16 x i16> @revpair_bitcast_v4i32_v16i16(<4 x i32> %a0, <4 x i32> %a1) { ; negative - bitcasts (unscalable element counts) define <4 x i32> @shuffle_bitcast_v32i40_v4i32(<32 x i40> %a0, <32 x i40> %a1) { -; CHECK-LABEL: @shuffle_bitcast_v32i40_v4i32( -; CHECK-NEXT: [[X0:%.*]] = bitcast <32 x i40> [[A0:%.*]] to <40 x i32> -; CHECK-NEXT: [[X1:%.*]] = bitcast <32 x i40> [[A1:%.*]] to <40 x i32> +; CHECK-LABEL: define <4 x i32> 
@shuffle_bitcast_v32i40_v4i32( +; CHECK-SAME: <32 x i40> [[A0:%.*]], <32 x i40> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = bitcast <32 x i40> [[A0]] to <40 x i32> +; CHECK-NEXT: [[X1:%.*]] = bitcast <32 x i40> [[A1]] to <40 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <40 x i32> [[X0]], <40 x i32> [[X1]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; @@ -257,9 +277,10 @@ define <4 x i32> @shuffle_bitcast_v32i40_v4i32(<32 x i40> %a0, <32 x i40> %a1) { ; negative - src type mismatch define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) { -; CHECK-LABEL: @concat_sext_v4i8_v4i16_v8i32( -; CHECK-NEXT: [[X0:%.*]] = sext <4 x i8> [[A0:%.*]] to <4 x i32> -; CHECK-NEXT: [[X1:%.*]] = sext <4 x i16> [[A1:%.*]] to <4 x i32> +; CHECK-LABEL: define <8 x i32> @concat_sext_v4i8_v4i16_v8i32( +; CHECK-SAME: <4 x i8> [[A0:%.*]], <4 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = sext <4 x i8> [[A0]] to <4 x i32> +; CHECK-NEXT: [[X1:%.*]] = sext <4 x i16> [[A1]] to <4 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; @@ -272,9 +293,10 @@ define <8 x i32> @concat_sext_v4i8_v4i16_v8i32(<4 x i8> %a0, <4 x i16> %a1) { ; negative - castop mismatch define <16 x i32> @concat_sext_zext_v8i16_v16i32(<8 x i16> %a0, <8 x i16> %a1) { -; CHECK-LABEL: @concat_sext_zext_v8i16_v16i32( -; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0:%.*]] to <8 x i32> -; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1:%.*]] to <8 x i32> +; CHECK-LABEL: define <16 x i32> @concat_sext_zext_v8i16_v16i32( +; CHECK-SAME: <8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[X0:%.*]] = sext <8 x i16> [[A0]] to <8 x i32> +; CHECK-NEXT: [[X1:%.*]] = zext <8 x i16> [[A1]] to <8 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i32> [[X0]], <8 x i32> [[X1]], <16 x i32> ; CHECK-NEXT: ret <16 x i32> [[R]] ; diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll new file mode 100644 index 0000000000000..b5b5bb997c6c7 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-shuffles.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=sse2 | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=avx2 | FileCheck %s + +; TODO: fold to identity + +define <8 x i32> @concat_extract_subvectors(<8 x i32> %x) { +; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors( +; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[CONCAT]] +; + %lo = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %hi = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + ret <8 x i32> %concat +} + +; negative test - shuffle contains undef + +define <8 x i32> @concat_extract_subvectors_undef(<8 x i32> %x) { +; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors_undef( +; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: 
[[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> undef, <4 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[CONCAT]] +; + %lo = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> + %hi = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + ret <8 x i32> %concat +} + +; negative test - shuffle contains poison + +define <8 x i32> @concat_extract_subvectors_poison(<8 x i32> %x) { +; CHECK-LABEL: define <8 x i32> @concat_extract_subvectors_poison( +; CHECK-SAME: <8 x i32> [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LO:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[HI:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[CONCAT:%.*]] = shufflevector <4 x i32> [[LO]], <4 x i32> [[HI]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[CONCAT]] +; + %lo = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %hi = shufflevector <8 x i32> %x, <8 x i32> poison, <4 x i32> + %concat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> + ret <8 x i32> %concat +} diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll index 8337bb37bc549..c8c9aa161ae28 100644 --- a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -1,17 +1,21 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX +declare void @use(<4 x i32>) + ; x86 does not have a cheap v16i8 shuffle until SSSE3 (pshufb) define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { -; SSE-LABEL: @bitcast_shuf_narrow_element( -; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +; SSE-LABEL: define <16 x i8> @bitcast_shuf_narrow_element( +; SSE-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <4 x i32> ; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> ; SSE-NEXT: ret <16 x i8> [[R]] ; -; AVX-LABEL: @bitcast_shuf_narrow_element( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +; AVX-LABEL: define <16 x i8> @bitcast_shuf_narrow_element( +; AVX-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V]] to <16 x i8> ; AVX-NEXT: [[R:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> ; AVX-NEXT: ret <16 x i8> [[R]] ; @@ -23,8 +27,9 @@ define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { ; v4f32 is the same cost as v4i32, so this always works define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_same_size( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <4 x float> +; CHECK-LABEL: define <4 x float> @bitcast_shuf_same_size( +; CHECK-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[V]] to <4 x float> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[R]] ; @@ -36,13 +41,15 @@ define <4 x float>
@bitcast_shuf_same_size(<4 x i32> %v) { ; Length-changing shuffles define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) { -; SSE-LABEL: @bitcast_shuf_narrow_element_subvector( -; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> poison, <4 x i32> +; SSE-LABEL: define <16 x i8> @bitcast_shuf_narrow_element_subvector( +; SSE-SAME: <2 x i32> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i32> [[V]], <2 x i32> poison, <4 x i32> ; SSE-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> ; SSE-NEXT: ret <16 x i8> [[R]] ; -; AVX-LABEL: @bitcast_shuf_narrow_element_subvector( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +; AVX-LABEL: define <16 x i8> @bitcast_shuf_narrow_element_subvector( +; AVX-SAME: <2 x i32> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[V]] to <8 x i8> ; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i8> [[TMP1]], <8 x i8> poison, <16 x i32> ; AVX-NEXT: ret <16 x i8> [[R]] ; @@ -52,13 +59,15 @@ define <16 x i8> @bitcast_shuf_narrow_element_subvector(<2 x i32> %v) { } define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) { -; SSE-LABEL: @bitcast_shuf_narrow_element_concat_subvectors( -; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V:%.*]], <2 x i64> poison, <4 x i32> +; SSE-LABEL: define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors( +; SSE-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <4 x i32> ; SSE-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <16 x i16> ; SSE-NEXT: ret <16 x i16> [[R]] ; -; AVX-LABEL: @bitcast_shuf_narrow_element_concat_subvectors( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16> +; AVX-LABEL: define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors( +; AVX-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V]] to <8 x i16> ; AVX-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> ; AVX-NEXT: ret <16 x i16> [[R]] ; @@ -68,8 +77,9 @@ define <16 x i16> @bitcast_shuf_narrow_element_concat_subvectors(<2 x i64> %v) { } define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_extract_subvector( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V:%.*]] to <32 x i8> +; CHECK-LABEL: define <16 x i8> @bitcast_shuf_extract_subvector( +; CHECK-SAME: <8 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[V]] to <32 x i8> ; CHECK-NEXT: [[R:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <16 x i32> ; CHECK-NEXT: ret <16 x i8> [[R]] ; @@ -81,8 +91,9 @@ define <16 x i8> @bitcast_shuf_extract_subvector(<8 x i32> %v) { ; Negative test - must cast to vector type define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_narrow_element_wrong_type( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-LABEL: define i128 @bitcast_shuf_narrow_element_wrong_type( +; CHECK-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to i128 ; CHECK-NEXT: ret i128 [[R]] ; @@ -94,8 +105,9 @@ define i128 @bitcast_shuf_narrow_element_wrong_type(<4 x i32> %v) { ; Widen shuffle elements define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { -; CHECK-LABEL: @bitcast_shuf_wide_element( -; 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32> +; CHECK-LABEL: define <4 x i32> @bitcast_shuf_wide_element( +; CHECK-SAME: <8 x i16> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[V]] to <4 x i32> ; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] ; @@ -104,13 +116,12 @@ define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { ret <4 x i32> %r } -declare void @use(<4 x i32>) - ; Negative test - don't create an extra shuffle define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { -; CHECK-LABEL: @bitcast_shuf_uses( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> poison, <4 x i32> +; CHECK-LABEL: define <16 x i8> @bitcast_shuf_uses( +; CHECK-SAME: <4 x i32> [[V:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: call void @use(<4 x i32> [[SHUF]]) ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> ; CHECK-NEXT: ret <16 x i8> [[R]] @@ -125,8 +136,9 @@ define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { ; TODO - can we remove the empty bitcast(bitcast()) ? define <4 x i64> @bitcast_shuf_remove_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK-LABEL: @bitcast_shuf_remove_bitcasts( -; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i32> +; CHECK-LABEL: define <4 x i64> @bitcast_shuf_remove_bitcasts( +; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[A0]], <2 x i64> [[A1]], <4 x i32> ; CHECK-NEXT: [[SHUF:%.*]] = bitcast <4 x i64> [[R]] to <8 x i32> ; CHECK-NEXT: [[R1:%.*]] = bitcast <8 x i32> [[SHUF]] to <4 x i64> ; CHECK-NEXT: ret <4 x i64> [[R1]] @@ -141,9 +153,10 @@ define <4 x i64> @bitcast_shuf_remove_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { ; shuffle of 2 operands must reduce bitcasts define <8 x i32> @bitcast_shuf_one_bitcast(<4 x i32> %a0, <2 x i64> %a1) { -; CHECK-LABEL: @bitcast_shuf_one_bitcast( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[A1:%.*]] to <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> [[TMP1]], <8 x i32> +; CHECK-LABEL: define <8 x i32> @bitcast_shuf_one_bitcast( +; CHECK-SAME: <4 x i32> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[A1]] to <4 x i32> +; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; %bc0 = bitcast <4 x i32> %a0 to <2 x i64> @@ -155,8 +168,9 @@ define <8 x i32> @bitcast_shuf_one_bitcast(<4 x i32> %a0, <2 x i64> %a1) { ; Negative test - shuffle of 2 operands must not increase bitcasts define <8 x i32> @bitcast_shuf_too_many_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { -; CHECK-LABEL: @bitcast_shuf_too_many_bitcasts( -; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]], <4 x i32> +; CHECK-LABEL: define <8 x i32> @bitcast_shuf_too_many_bitcasts( +; CHECK-SAME: <2 x i64> [[A0:%.*]], <2 x i64> [[A1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <2 x i64> [[A0]], <2 x i64> [[A1]], <4 x i32> ; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i64> [[SHUF]] to <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[R]] ; @@ -166,8 +180,9 @@ define <8 x i32> @bitcast_shuf_too_many_bitcasts(<2 x i64> %a0, <2 x i64> %a1) { } define <2 x i64> @PR35454_1(<2 x i64> %v) { -; SSE-LABEL: @PR35454_1( -; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to 
<4 x i32> +; SSE-LABEL: define <2 x i64> @PR35454_1( +; SSE-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V]] to <4 x i32> ; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> poison, <4 x i32> ; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> ; SSE-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], @@ -176,8 +191,9 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) { ; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> ; SSE-NEXT: ret <2 x i64> [[BC3]] ; -; AVX-LABEL: @PR35454_1( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <16 x i8> +; AVX-LABEL: define <2 x i64> @PR35454_1( +; AVX-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V]] to <16 x i8> ; AVX-NEXT: [[BC1:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> ; AVX-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], ; AVX-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> @@ -196,8 +212,9 @@ define <2 x i64> @PR35454_1(<2 x i64> %v) { } define <2 x i64> @PR35454_2(<2 x i64> %v) { -; SSE-LABEL: @PR35454_2( -; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; SSE-LABEL: define <2 x i64> @PR35454_2( +; SSE-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; SSE-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V]] to <4 x i32> ; SSE-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> poison, <4 x i32> ; SSE-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> ; SSE-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], @@ -206,8 +223,9 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) { ; SSE-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> ; SSE-NEXT: ret <2 x i64> [[BC3]] ; -; AVX-LABEL: @PR35454_2( -; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V:%.*]] to <8 x i16> +; AVX-LABEL: define <2 x i64> @PR35454_2( +; AVX-SAME: <2 x i64> [[V:%.*]]) #[[ATTR0]] { +; AVX-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[V]] to <8 x i16> ; AVX-NEXT: [[BC1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> ; AVX-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], ; AVX-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> @@ -224,187 +242,3 @@ define <2 x i64> @PR35454_2(<2 x i64> %v) { %bc3 = bitcast <4 x i32> %permil1 to <2 x i64> ret <2 x i64> %bc3 } - -; Shuffle is much cheaper than fdiv. FMF are intersected. - -define <4 x float> @shuf_fdiv_v4f32_yy(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: @shuf_fdiv_v4f32_yy( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = fdiv arcp <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[R]] -; - %b0 = fdiv fast <4 x float> %x, %y - %b1 = fdiv arcp <4 x float> %z, %y - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> - ret <4 x float> %r -} - -; Common operand is op0 of the binops. 
- -define <4 x i32> @shuf_add_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_add_v4i32_xx( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = add <4 x i32> %x, %y - %b1 = add <4 x i32> %x, %z - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; For commutative instructions, common operand may be swapped. - -define <4 x float> @shuf_fmul_v4f32_xx_swap(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: @shuf_fmul_v4f32_xx_swap( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[R:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x float> [[R]] -; - %b0 = fmul <4 x float> %x, %y - %b1 = fmul <4 x float> %z, %x - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> - ret <4 x float> %r -} - -; For commutative instructions, common operand may be swapped. - -define <2 x i64> @shuf_and_v2i64_yy_swap(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { -; CHECK-LABEL: @shuf_and_v2i64_yy_swap( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[Y:%.*]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X:%.*]], <2 x i64> [[Z:%.*]], <2 x i32> -; CHECK-NEXT: [[R:%.*]] = and <2 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <2 x i64> [[R]] -; - %b0 = and <2 x i64> %x, %y - %b1 = and <2 x i64> %y, %z - %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> - ret <2 x i64> %r -} - -; non-commutative binop, but common op0 - -define <4 x i32> @shuf_shl_v4i32_xx(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_shl_v4i32_xx( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]], <4 x i32> -; CHECK-NEXT: [[R:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = shl <4 x i32> %x, %y - %b1 = shl <4 x i32> %x, %z - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - common operand, but not commutable - -define <4 x i32> @shuf_shl_v4i32_xx_swap(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_shl_v4i32_xx_swap( -; CHECK-NEXT: [[B0:%.*]] = shl <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = shl <4 x i32> [[Z:%.*]], [[X]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = shl <4 x i32> %x, %y - %b1 = shl <4 x i32> %z, %x - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - mismatched opcodes - -define <2 x i64> @shuf_sub_add_v2i64_yy(<2 x i64> %x, <2 x i64> %y, <2 x i64> %z) { -; CHECK-LABEL: @shuf_sub_add_v2i64_yy( -; CHECK-NEXT: [[B0:%.*]] = sub <2 x i64> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = add <2 x i64> [[Z:%.*]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i64> [[B0]], <2 x i64> [[B1]], <2 x i32> -; CHECK-NEXT: ret <2 x i64> [[R]] -; - %b0 = sub <2 x i64> %x, %y - %b1 = add <2 x i64> %z, %y - %r = shufflevector <2 x i64> %b0, <2 x i64> %b1, <2 x i32> - ret <2 x i64> %r -} - -; 
negative test - type change via shuffle - -define <8 x float> @shuf_fmul_v4f32_xx_type(<4 x float> %x, <4 x float> %y, <4 x float> %z) { -; CHECK-LABEL: @shuf_fmul_v4f32_xx_type( -; CHECK-NEXT: [[B0:%.*]] = fmul <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = fmul <4 x float> [[Z:%.*]], [[X]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <8 x i32> -; CHECK-NEXT: ret <8 x float> [[R]] -; - %b0 = fmul <4 x float> %x, %y - %b1 = fmul <4 x float> %z, %x - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <8 x i32> - ret <8 x float> %r -} - -; negative test - uses - -define <4 x i32> @shuf_lshr_v4i32_yy_use1(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_lshr_v4i32_yy_use1( -; CHECK-NEXT: [[B0:%.*]] = lshr <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: call void @use(<4 x i32> [[B0]]) -; CHECK-NEXT: [[B1:%.*]] = lshr <4 x i32> [[Z:%.*]], [[Y]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = lshr <4 x i32> %x, %y - call void @use(<4 x i32> %b0) - %b1 = lshr <4 x i32> %z, %y - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - uses - -define <4 x i32> @shuf_mul_v4i32_yy_use2(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) { -; CHECK-LABEL: @shuf_mul_v4i32_yy_use2( -; CHECK-NEXT: [[B0:%.*]] = mul <4 x i32> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = mul <4 x i32> [[Z:%.*]], [[Y]] -; CHECK-NEXT: call void @use(<4 x i32> [[B1]]) -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[B0]], <4 x i32> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x i32> [[R]] -; - %b0 = mul <4 x i32> %x, %y - %b1 = mul <4 x i32> %z, %y - call void @use(<4 x i32> %b1) - %r = shufflevector <4 x i32> %b0, <4 x i32> %b1, <4 x i32> - ret <4 x i32> %r -} - -; negative test - must have matching operand - -define <4 x float> @shuf_fadd_v4f32_no_common_op(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) { -; CHECK-LABEL: @shuf_fadd_v4f32_no_common_op( -; CHECK-NEXT: [[B0:%.*]] = fadd <4 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = fadd <4 x float> [[Z:%.*]], [[W:%.*]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[B0]], <4 x float> [[B1]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[R]] -; - %b0 = fadd <4 x float> %x, %y - %b1 = fadd <4 x float> %z, %w - %r = shufflevector <4 x float> %b0, <4 x float> %b1, <4 x i32> - ret <4 x float> %r -} - -; negative test - binops may be relatively cheap - -define <16 x i16> @shuf_and_v16i16_yy_expensive_shuf(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) { -; CHECK-LABEL: @shuf_and_v16i16_yy_expensive_shuf( -; CHECK-NEXT: [[B0:%.*]] = and <16 x i16> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[B1:%.*]] = and <16 x i16> [[Y]], [[Z:%.*]] -; CHECK-NEXT: [[R:%.*]] = shufflevector <16 x i16> [[B0]], <16 x i16> [[B1]], <16 x i32> -; CHECK-NEXT: ret <16 x i16> [[R]] -; - %b0 = and <16 x i16> %x, %y - %b1 = and <16 x i16> %y, %z - %r = shufflevector <16 x i16> %b0, <16 x i16> %b1, <16 x i32> - ret <16 x i16> %r -} diff --git a/llvm/test/Verifier/variadic.ll b/llvm/test/Verifier/variadic.ll new file mode 100644 index 0000000000000..55e4a4da0a920 --- /dev/null +++ b/llvm/test/Verifier/variadic.ll @@ -0,0 +1,8 @@ +; RUN: not opt -S -passes=verify 2>&1 < %s | FileCheck %s + +; CHECK: va_start called in a non-varargs function +declare void @llvm.va_start(ptr) +define void @not_vararg(ptr %p) nounwind { + call void @llvm.va_start(ptr %p) + ret void +} diff --git 
a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index a65f140b46015..3ac63d070933d 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -30,7 +30,6 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ld.u64 %rd7, [%SP+24]; ; CHECK-NEXT: ld.u64 %rd8, [%SP+16]; ; CHECK-NEXT: { // callseq 0, 0 -; CHECK-NEXT: .reg .b32 temp_param_reg; ; CHECK-NEXT: .param .align 16 .b8 param0[32]; ; CHECK-NEXT: st.param.v2.b64 [param0+0], {%rd6, %rd5}; ; CHECK-NEXT: st.param.v2.b64 [param0+16], {%rd8, %rd7}; diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll new file mode 100644 index 0000000000000..4eb05b943f506 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -S | FileCheck %s + +define i32 @phi_after_label(i1 %cc) { +entry: + br i1 %cc, label %then, label %end + +then: + br label %end + +end: + %r = phi i32 [ 0, %entry ], [ 1, %then ] + ret i32 %r +} + +define void @phi_before_label(i32 %bound) { +entry: + br label %loop + +loop: + %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop ] + %ctr.next = add i32 %ctr, 1 + %cc = icmp ult i32 %ctr.next, %bound + br i1 %cc, label %loop, label %end + +end: + ret void +} + +define i32 @phi_after_label_unnamed(i1 %cc) { +0: + br i1 %cc, label %1, label %2 + +1: + br label %2 + +2: + %r = phi i32 [ 0, %0 ], [ 1, %1 ] + ret i32 %r +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected new file mode 100644 index 0000000000000..1d21ebe547f68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/phi-labels.ll.expected @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -S | FileCheck %s + +define i32 @phi_after_label(i1 %cc) { +; CHECK-LABEL: define i32 @phi_after_label( +; CHECK-SAME: i1 [[CC:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[CC]], label [[THEN:%.*]], label [[END:%.*]] +; CHECK: then: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ 1, [[THEN]] ] +; CHECK-NEXT: ret i32 [[R]] +; +entry: + br i1 %cc, label %then, label %end + +then: + br label %end + +end: + %r = phi i32 [ 0, %entry ], [ 1, %then ] + ret i32 %r +} + +define void @phi_before_label(i32 %bound) { +; CHECK-LABEL: define void @phi_before_label( +; CHECK-SAME: i32 [[BOUND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[CTR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CTR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[CTR_NEXT]] = add i32 [[CTR]], 1 +; CHECK-NEXT: [[CC:%.*]] = icmp ult i32 [[CTR_NEXT]], [[BOUND]] +; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %ctr = phi i32 [ 0, %entry ], [ %ctr.next, %loop ] + %ctr.next = add i32 %ctr, 1 + %cc = icmp ult i32 %ctr.next, %bound + br i1 %cc, label %loop, label %end + +end: + ret void +} + +define i32 @phi_after_label_unnamed(i1 %cc) { 
+; CHECK-LABEL: define i32 @phi_after_label_unnamed( +; CHECK-SAME: i1 [[CC:%.*]]) { +; CHECK-NEXT: br i1 [[CC]], label [[TMP1:%.*]], label [[TMP2:%.*]] +; CHECK: 1: +; CHECK-NEXT: br label [[TMP2]] +; CHECK: 2: +; CHECK-NEXT: [[R:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ 1, [[TMP1]] ] +; CHECK-NEXT: ret i32 [[R]] +; +0: + br i1 %cc, label %1, label %2 + +1: + br label %2 + +2: + %r = phi i32 [ 0, %0 ], [ 1, %1 ] + ret i32 %r +} diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll new file mode 100644 index 0000000000000..b4fd23a3d81ce --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt < %s -S | FileCheck %s + +; The assumption underlying this test is that there are pre-existing check lines +; but something has changed, and we would like to avoid needless changes of +; meta variable names so that diffs end up being easier to read, e.g. avoid +; changing X_I33 into X_I34 or renumbering the various TMP variables. + +define i32 @func({i32, i32} %x, i32 %y) { + %x.i34 = extractvalue {i32, i32} %x, 0 + %1 = add i32 %y, 1 + %2 = add i32 %x.i34, %1 + %3 = mul i32 %2, 3 + ret i32 %3 +} + +; CHECK-LABEL: define i32 @func( +; CHECK-SAME: { i32, i32 } [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_I33:%.*]] = extractvalue { i32, i32 } [[X]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X_I33]], [[Y]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[TMP2]] +; diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected new file mode 100644 index 0000000000000..86f929ffe36af --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/stable_ir_values_funcs.ll.expected @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs --version 3 +; RUN: opt < %s -S | FileCheck %s + +; The assumption underlying this test is that there are pre-existing check lines +; but something has changed, and we would like to avoid needless changes of +; meta variable names so that diffs end up being easier to read, e.g. avoid +; changing X_I33 into X_I34 or renumbering the various TMP variables. + +define i32 @func({i32, i32} %x, i32 %y) { + %x.i34 = extractvalue {i32, i32} %x, 0 + %1 = add i32 %y, 1 + %2 = add i32 %x.i34, %1 + %3 = mul i32 %2, 3 + ret i32 %3 +} + +; CHECK-LABEL: define i32 @func( +; CHECK-SAME: { i32, i32 } [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[X_I33:%.*]] = extractvalue { i32, i32 } [[X]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[Y]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[X_I33]], [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP1]], 3 +; CHECK-NEXT: ret i32 [[TMP2]] +; diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll new file mode 100644 index 0000000000000..9a9cc0a06936f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll @@ -0,0 +1,168 @@ +; Just run it through opt, no passes needed. 
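+; This module carries debug records (#dbg_assign, #dbg_declare) rather than llvm.dbg intrinsics; the --write-experimental-debuginfo=true flag in the RUN line keeps them in record form in the output.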
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, 
!dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected new file mode 100644 index 0000000000000..1f9c37ccfbd88 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.expected @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Just run it through opt, no passes needed. 
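+; This module carries debug records (#dbg_assign, #dbg_declare) rather than llvm.dbg intrinsics; the --write-experimental-debuginfo=true flag in the RUN line keeps them in record form in the output.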
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa 
!18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
[[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, 
type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected new file mode 100644 index 0000000000000..5905e443deff2 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature +; Just run it through opt, no passes needed. 
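+; This module carries debug records (#dbg_assign, #dbg_declare) rather than llvm.dbg intrinsics; the --write-experimental-debuginfo=true flag in the RUN line keeps them in record form in the output.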
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: define {{[^@]+}}@foo +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG7:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, 
align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: define {{[^@]+}}@bar +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0]] !dbg [[DBG41:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: 
[[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) 
+!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected new file mode 100644 index 0000000000000..579d6a437d0e5 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: define {{[^@]+}}@foo +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG7:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, 
align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: define {{[^@]+}}@bar +; CHECK-SAME: (ptr [[A:%.*]]) #[[ATTR0]] !dbg [[DBG41:![0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: 
[[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) 
+!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) +;. +; CHECK: attributes #[[ATTR0]] = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +; CHECK: attributes #[[ATTR2]] = { nounwind } +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "various_ir_values.c", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"Dwarf Version", i32 4} +; CHECK: [[META4:![0-9]+]] = !{i32 2, !"Debug Info Version", i32 3} +; CHECK: [[META5:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[DBG7]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 1, type: [[META8:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12:![0-9]+]]) +; CHECK: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; CHECK: [[META9]] = !{null, [[META10:![0-9]+]]} +; CHECK: [[META10]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META11:![0-9]+]], size: 64) +; CHECK: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META12]] = !{[[META13]], [[META14]]} +; CHECK: [[META13]] = !DILocalVariable(name: "A", arg: 1, scope: [[DBG7]], file: [[META1]], line: 1, type: [[META10]]) +; CHECK: [[META14]] = !DILocalVariable(name: "i", scope: [[META15:![0-9]+]], file: [[META1]], line: 3, type: [[META11]]) +; CHECK: [[META15]] = distinct !DILexicalBlock(scope: [[DBG7]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DIASSIGNID16]] = distinct !DIAssignID() +; CHECK: [[META17]] = !DILocation(line: 1, column: 15, scope: [[DBG7]]) +; CHECK: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +; CHECK: [[META19]] = !{!"any pointer", [[META20:![0-9]+]], i64 0} +; CHECK: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} +; CHECK: [[META21]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[DBG22]] = !DILocation(line: 3, column: 8, scope: [[META15]]) +; CHECK: [[META23]] = !DILocation(line: 3, column: 12, scope: [[META15]]) +; CHECK: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0} +; CHECK: [[META25]] = !{!"int", [[META20]], i64 0} +; CHECK: [[DBG26]] = !DILocation(line: 3, column: 19, scope: [[META27:![0-9]+]]) +; CHECK: [[META27]] = distinct !DILexicalBlock(scope: [[META15]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DBG28]] = !DILocation(line: 3, column: 24, scope: [[META27]]) +; CHECK: [[DBG29]] = !DILocation(line: 3, column: 23, scope: [[META27]]) +; CHECK: [[DBG30]] = !DILocation(line: 3, column: 21, scope: [[META27]]) +; CHECK: [[DBG31]] = !DILocation(line: 3, column: 3, scope: [[META15]]) +; CHECK: [[PROF32]] = !{!"branch_weights", i32 1, i32 1048575} +; CHECK: [[DBG33]] = !DILocation(line: 3, column: 3, scope: [[META27]]) +; CHECK: [[DBG34]] = !DILocation(line: 4, column: 5, scope: [[META27]]) +; CHECK: [[DBG35]] = !DILocation(line: 4, column: 7, scope: [[META27]]) +; CHECK: [[DBG36]] = !DILocation(line: 4, column: 10, scope: [[META27]]) +; CHECK: [[DBG37]] = !DILocation(line: 3, column: 27, scope: [[META27]]) +; CHECK: [[LOOP38]] = distinct !{[[LOOP38]], [[DBG31]], [[META39:![0-9]+]]} +; CHECK: [[META39]] = !DILocation(line: 4, column: 12, scope: [[META15]]) +; CHECK: [[DBG40]] = !DILocation(line: 5, column: 1, scope: [[DBG7]]) +; CHECK: [[DBG41]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 7, type: [[META8]], scopeLine: 7, flags: 
DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META42:![0-9]+]]) +; CHECK: [[META42]] = !{[[META43]], [[META44]]} +; CHECK: [[META43]] = !DILocalVariable(name: "A", arg: 1, scope: [[DBG41]], file: [[META1]], line: 7, type: [[META10]]) +; CHECK: [[META44]] = !DILocalVariable(name: "i", scope: [[META45:![0-9]+]], file: [[META1]], line: 9, type: [[META11]]) +; CHECK: [[META45]] = distinct !DILexicalBlock(scope: [[DBG41]], file: [[META1]], line: 9, column: 3) +; CHECK: [[META46]] = !DILocation(line: 7, column: 15, scope: [[DBG41]]) +; CHECK: [[DBG47]] = !DILocation(line: 9, column: 8, scope: [[META45]]) +; CHECK: [[META48]] = !DILocation(line: 9, column: 12, scope: [[META45]]) +; CHECK: [[DBG49]] = !DILocation(line: 9, column: 19, scope: [[META50:![0-9]+]]) +; CHECK: [[META50]] = distinct !DILexicalBlock(scope: [[META45]], file: [[META1]], line: 9, column: 3) +; CHECK: [[DBG51]] = !DILocation(line: 9, column: 24, scope: [[META50]]) +; CHECK: [[DBG52]] = !DILocation(line: 9, column: 23, scope: [[META50]]) +; CHECK: [[DBG53]] = !DILocation(line: 9, column: 21, scope: [[META50]]) +; CHECK: [[DBG54]] = !DILocation(line: 9, column: 3, scope: [[META45]]) +; CHECK: [[DBG55]] = !DILocation(line: 9, column: 3, scope: [[META50]]) +; CHECK: [[DBG56]] = !DILocation(line: 10, column: 5, scope: [[META50]]) +; CHECK: [[DBG57]] = !DILocation(line: 10, column: 7, scope: [[META50]]) +; CHECK: [[DBG58]] = !DILocation(line: 10, column: 10, scope: [[META50]]) +; CHECK: [[DBG59]] = !DILocation(line: 9, column: 27, scope: [[META50]]) +; CHECK: [[LOOP60]] = distinct !{[[LOOP60]], [[DBG54]], [[META61:![0-9]+]]} +; CHECK: [[META61]] = !DILocation(line: 10, column: 12, scope: [[META45]]) +; CHECK: [[DBG62]] = !DILocation(line: 11, column: 1, scope: [[DBG41]]) +;. diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected new file mode 100644 index 0000000000000..1f9c37ccfbd88 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected @@ -0,0 +1,238 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa 
!18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
[[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, 
type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected new file mode 100644 index 0000000000000..e2c426029a6b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals smart +; Just run it through opt, no passes needed. 
+; RUN: opt < %s -S --write-experimental-debuginfo=true | FileCheck %s + +; ModuleID = 'various_ir_values.c' +source_filename = "various_ir_values.c" +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define dso_local void @foo(ptr %A) #0 !dbg !7 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, !DIAssignID [[DIASSIGNID16:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: #dbg_assign(i1 undef, [[META13:![0-9]+]], !DIExpression(), [[DIASSIGNID16]], ptr [[A_ADDR]], !DIExpression(), [[META17:![0-9]+]]) +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META13]], !DIExpression(), [[META17]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2:[0-9]+]], !dbg [[DBG22:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META14:![0-9]+]], !DIExpression(), [[META23:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META23]], !tbaa [[TBAA24:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG22]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG26:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG28:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG29:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG30:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG31:![0-9]+]], !prof [[PROF32:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG33:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG35:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG34]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG34]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG36:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG34]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG37]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG37]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND]], !dbg [[DBG33]], !llvm.loop [[LOOP38:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG40:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8, !DIAssignID !16 + %i = alloca i32, align 4 + #dbg_assign(i1 undef, !13, !DIExpression(), !16, ptr %A.addr, !DIExpression(), !17) + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !13, !DIExpression(), !17) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !22 + #dbg_declare(ptr %i, !14, !DIExpression(), !23) + store i32 0, ptr %i, align 4, !dbg !23, !tbaa !24 + br label %for.cond, !dbg !22 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !26, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !28, !tbaa 
!18 + %2 = load i32, ptr %1, align 4, !dbg !29, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !30 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !31, !prof !32 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !33 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !34, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !35, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !34 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !34 + store i32 0, ptr %arrayidx, align 4, !dbg !36, !tbaa !24 + br label %for.inc, !dbg !34 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !37, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !37 + store i32 %inc, ptr %i, align 4, !dbg !37, !tbaa !24 + br label %for.cond, !dbg !33, !llvm.loop !38 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !40 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1 + +; Function Attrs: nounwind uwtable +define dso_local void @bar(ptr %A) #0 !dbg !41 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store ptr [[A:%.*]], ptr [[A_ADDR]], align 8, !tbaa [[TBAA18]] +; CHECK-NEXT: #dbg_declare(ptr [[A_ADDR]], [[META43:![0-9]+]], !DIExpression(), [[META46:![0-9]+]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG47:![0-9]+]] +; CHECK-NEXT: #dbg_declare(ptr [[I]], [[META44:![0-9]+]], !DIExpression(), [[META48:![0-9]+]]) +; CHECK-NEXT: store i32 0, ptr [[I]], align 4, !dbg [[META48]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_COND:%.*]], !dbg [[DBG47]] +; CHECK: for.cond: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG49:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG51:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP0]], [[TMP2]], !dbg [[DBG53:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG54:![0-9]+]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[I]]) #[[ATTR2]], !dbg [[DBG55:![0-9]+]] +; CHECK-NEXT: br label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[A_ADDR]], align 8, !dbg [[DBG56:![0-9]+]], !tbaa [[TBAA18]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP4]] to i64, !dbg [[DBG56]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 [[IDXPROM]], !dbg [[DBG56]] +; CHECK-NEXT: store i32 0, ptr [[ARRAYIDX]], align 4, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label [[FOR_INC:%.*]], !dbg [[DBG56]] +; CHECK: for.inc: +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[I]], align 4, !dbg [[DBG59:![0-9]+]], !tbaa [[TBAA24]] +; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP5]], 1, !dbg [[DBG59]] +; CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4, !dbg [[DBG59]], !tbaa [[TBAA24]] +; CHECK-NEXT: br label 
[[FOR_COND]], !dbg [[DBG55]], !llvm.loop [[LOOP60:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void, !dbg [[DBG62:![0-9]+]] +; +entry: + %A.addr = alloca ptr, align 8 + %i = alloca i32, align 4 + store ptr %A, ptr %A.addr, align 8, !tbaa !18 + #dbg_declare(ptr %A.addr, !43, !DIExpression(), !46) + call void @llvm.lifetime.start.p0(i64 4, ptr %i) #2, !dbg !47 + #dbg_declare(ptr %i, !44, !DIExpression(), !48) + store i32 0, ptr %i, align 4, !dbg !48, !tbaa !24 + br label %for.cond, !dbg !47 + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, ptr %i, align 4, !dbg !49, !tbaa !24 + %1 = load ptr, ptr %A.addr, align 8, !dbg !51, !tbaa !18 + %2 = load i32, ptr %1, align 4, !dbg !52, !tbaa !24 + %cmp = icmp slt i32 %0, %2, !dbg !53 + br i1 %cmp, label %for.body, label %for.cond.cleanup, !dbg !54 + +for.cond.cleanup: ; preds = %for.cond + call void @llvm.lifetime.end.p0(i64 4, ptr %i) #2, !dbg !55 + br label %for.end + +for.body: ; preds = %for.cond + %3 = load ptr, ptr %A.addr, align 8, !dbg !56, !tbaa !18 + %4 = load i32, ptr %i, align 4, !dbg !57, !tbaa !24 + %idxprom = sext i32 %4 to i64, !dbg !56 + %arrayidx = getelementptr inbounds i32, ptr %3, i64 %idxprom, !dbg !56 + store i32 0, ptr %arrayidx, align 4, !dbg !58, !tbaa !24 + br label %for.inc, !dbg !56 + +for.inc: ; preds = %for.body + %5 = load i32, ptr %i, align 4, !dbg !59, !tbaa !24 + %inc = add nsw i32 %5, 1, !dbg !59 + store i32 %inc, ptr %i, align 4, !dbg !59, !tbaa !24 + br label %for.cond, !dbg !55, !llvm.loop !60 + +for.end: ; preds = %for.cond.cleanup + ret void, !dbg !62 +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } +attributes #2 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "various_ir_values.c", directory: "/data/build/llvm-project") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 1d5da8cd30fce1c0a2c2fa6ba656dbfaa36192c8)"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 3, 
type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 3, column: 3) +!16 = distinct !DIAssignID() +!17 = !DILocation(line: 1, column: 15, scope: !7) +!18 = !{!19, !19, i64 0} +!19 = !{!"any pointer", !20, i64 0} +!20 = !{!"omnipotent char", !21, i64 0} +!21 = !{!"Simple C/C++ TBAA"} +!22 = !DILocation(line: 3, column: 8, scope: !15) +!23 = !DILocation(line: 3, column: 12, scope: !15) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !20, i64 0} +!26 = !DILocation(line: 3, column: 19, scope: !27) +!27 = distinct !DILexicalBlock(scope: !15, file: !1, line: 3, column: 3) +!28 = !DILocation(line: 3, column: 24, scope: !27) +!29 = !DILocation(line: 3, column: 23, scope: !27) +!30 = !DILocation(line: 3, column: 21, scope: !27) +!31 = !DILocation(line: 3, column: 3, scope: !15) +!32 = !{!"branch_weights", i32 1, i32 1048575} +!33 = !DILocation(line: 3, column: 3, scope: !27) +!34 = !DILocation(line: 4, column: 5, scope: !27) +!35 = !DILocation(line: 4, column: 7, scope: !27) +!36 = !DILocation(line: 4, column: 10, scope: !27) +!37 = !DILocation(line: 3, column: 27, scope: !27) +!38 = distinct !{!38, !31, !39} +!39 = !DILocation(line: 4, column: 12, scope: !15) +!40 = !DILocation(line: 5, column: 1, scope: !7) +!41 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !8, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !42) +!42 = !{!43, !44} +!43 = !DILocalVariable(name: "A", arg: 1, scope: !41, file: !1, line: 7, type: !10) +!44 = !DILocalVariable(name: "i", scope: !45, file: !1, line: 9, type: !11) +!45 = distinct !DILexicalBlock(scope: !41, file: !1, line: 9, column: 3) +!46 = !DILocation(line: 7, column: 15, scope: !41) +!47 = !DILocation(line: 9, column: 8, scope: !45) +!48 = !DILocation(line: 9, column: 12, scope: !45) +!49 = !DILocation(line: 9, column: 19, scope: !50) +!50 = distinct !DILexicalBlock(scope: !45, file: !1, line: 9, column: 3) +!51 = !DILocation(line: 9, column: 24, scope: !50) +!52 = !DILocation(line: 9, column: 23, scope: !50) +!53 = !DILocation(line: 9, column: 21, scope: !50) +!54 = !DILocation(line: 9, column: 3, scope: !45) +!55 = !DILocation(line: 9, column: 3, scope: !50) +!56 = !DILocation(line: 10, column: 5, scope: !50) +!57 = !DILocation(line: 10, column: 7, scope: !50) +!58 = !DILocation(line: 10, column: 10, scope: !50) +!59 = !DILocation(line: 9, column: 27, scope: !50) +!60 = distinct !{!60, !54, !61} +!61 = !DILocation(line: 10, column: 12, scope: !45) +!62 = !DILocation(line: 11, column: 1, scope: !41) +;. 
+; CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) +; CHECK: [[META1]] = !DIFile(filename: "various_ir_values.c", directory: {{.*}}) +; CHECK: [[META2]] = !{} +; CHECK: [[META7:![0-9]+]] = distinct !DISubprogram(name: "foo", scope: [[META1]], file: [[META1]], line: 1, type: [[META8:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META12:![0-9]+]]) +; CHECK: [[META8]] = !DISubroutineType(types: [[META9:![0-9]+]]) +; CHECK: [[META9]] = !{null, [[META10:![0-9]+]]} +; CHECK: [[META10]] = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: [[META11:![0-9]+]], size: 64) +; CHECK: [[META11]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META12]] = !{[[META13]], [[META14]]} +; CHECK: [[META13]] = !DILocalVariable(name: "A", arg: 1, scope: [[META7]], file: [[META1]], line: 1, type: [[META10]]) +; CHECK: [[META14]] = !DILocalVariable(name: "i", scope: [[META15:![0-9]+]], file: [[META1]], line: 3, type: [[META11]]) +; CHECK: [[META15]] = distinct !DILexicalBlock(scope: [[META7]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DIASSIGNID16]] = distinct !DIAssignID() +; CHECK: [[META17]] = !DILocation(line: 1, column: 15, scope: [[META7]]) +; CHECK: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +; CHECK: [[META19]] = !{!"any pointer", [[META20:![0-9]+]], i64 0} +; CHECK: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} +; CHECK: [[META21]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[DBG22]] = !DILocation(line: 3, column: 8, scope: [[META15]]) +; CHECK: [[META23]] = !DILocation(line: 3, column: 12, scope: [[META15]]) +; CHECK: [[TBAA24]] = !{[[META25:![0-9]+]], [[META25]], i64 0} +; CHECK: [[META25]] = !{!"int", [[META20]], i64 0} +; CHECK: [[DBG26]] = !DILocation(line: 3, column: 19, scope: [[META27:![0-9]+]]) +; CHECK: [[META27]] = distinct !DILexicalBlock(scope: [[META15]], file: [[META1]], line: 3, column: 3) +; CHECK: [[DBG28]] = !DILocation(line: 3, column: 24, scope: [[META27]]) +; CHECK: [[DBG29]] = !DILocation(line: 3, column: 23, scope: [[META27]]) +; CHECK: [[DBG30]] = !DILocation(line: 3, column: 21, scope: [[META27]]) +; CHECK: [[DBG31]] = !DILocation(line: 3, column: 3, scope: [[META15]]) +; CHECK: [[PROF32]] = !{!"branch_weights", i32 1, i32 1048575} +; CHECK: [[DBG33]] = !DILocation(line: 3, column: 3, scope: [[META27]]) +; CHECK: [[DBG34]] = !DILocation(line: 4, column: 5, scope: [[META27]]) +; CHECK: [[DBG35]] = !DILocation(line: 4, column: 7, scope: [[META27]]) +; CHECK: [[DBG36]] = !DILocation(line: 4, column: 10, scope: [[META27]]) +; CHECK: [[DBG37]] = !DILocation(line: 3, column: 27, scope: [[META27]]) +; CHECK: [[LOOP38]] = distinct !{[[LOOP38]], [[DBG31]], [[META39:![0-9]+]]} +; CHECK: [[META39]] = !DILocation(line: 4, column: 12, scope: [[META15]]) +; CHECK: [[DBG40]] = !DILocation(line: 5, column: 1, scope: [[META7]]) +; CHECK: [[META41:![0-9]+]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 7, type: [[META8]], scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META42:![0-9]+]]) +; CHECK: [[META42]] = !{[[META43]], [[META44]]} +; CHECK: [[META43]] = !DILocalVariable(name: 
"A", arg: 1, scope: [[META41]], file: [[META1]], line: 7, type: [[META10]]) +; CHECK: [[META44]] = !DILocalVariable(name: "i", scope: [[META45:![0-9]+]], file: [[META1]], line: 9, type: [[META11]]) +; CHECK: [[META45]] = distinct !DILexicalBlock(scope: [[META41]], file: [[META1]], line: 9, column: 3) +; CHECK: [[META46]] = !DILocation(line: 7, column: 15, scope: [[META41]]) +; CHECK: [[DBG47]] = !DILocation(line: 9, column: 8, scope: [[META45]]) +; CHECK: [[META48]] = !DILocation(line: 9, column: 12, scope: [[META45]]) +; CHECK: [[DBG49]] = !DILocation(line: 9, column: 19, scope: [[META50:![0-9]+]]) +; CHECK: [[META50]] = distinct !DILexicalBlock(scope: [[META45]], file: [[META1]], line: 9, column: 3) +; CHECK: [[DBG51]] = !DILocation(line: 9, column: 24, scope: [[META50]]) +; CHECK: [[DBG52]] = !DILocation(line: 9, column: 23, scope: [[META50]]) +; CHECK: [[DBG53]] = !DILocation(line: 9, column: 21, scope: [[META50]]) +; CHECK: [[DBG54]] = !DILocation(line: 9, column: 3, scope: [[META45]]) +; CHECK: [[DBG55]] = !DILocation(line: 9, column: 3, scope: [[META50]]) +; CHECK: [[DBG56]] = !DILocation(line: 10, column: 5, scope: [[META50]]) +; CHECK: [[DBG57]] = !DILocation(line: 10, column: 7, scope: [[META50]]) +; CHECK: [[DBG58]] = !DILocation(line: 10, column: 10, scope: [[META50]]) +; CHECK: [[DBG59]] = !DILocation(line: 9, column: 27, scope: [[META50]]) +; CHECK: [[LOOP60]] = distinct !{[[LOOP60]], [[DBG54]], [[META61:![0-9]+]]} +; CHECK: [[META61]] = !DILocation(line: 10, column: 12, scope: [[META45]]) +; CHECK: [[DBG62]] = !DILocation(line: 11, column: 1, scope: [[META41]]) +;. diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test new file mode 100644 index 0000000000000..411c84de1dcba --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/phi-labels.test @@ -0,0 +1,5 @@ +# RUN: cp -f %S/Inputs/phi-labels.ll %t.ll && %update_test_checks --version 4 %t.ll +# RUN: diff -u %t.ll %S/Inputs/phi-labels.ll.expected +## Check that running the script again does not change the result: +# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/phi-labels.ll.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test new file mode 100644 index 0000000000000..5132fb9a26ff4 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test @@ -0,0 +1,2 @@ +# RUN: cp -f %S/Inputs/stable_ir_values_funcs.ll %t.ll && %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/stable_ir_values_funcs.ll.expected diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test new file mode 100644 index 0000000000000..9cc77d894d62c --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test @@ -0,0 +1,24 @@ +## Basic test checking that update_test_checks.py works correctly on various "IR value" kinds +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.expected +## Check that running the script again does not change the result: +# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.expected +## Also try the --function-signature 
flag +# RUN: %update_test_checks %t.ll --function-signature +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected +## Verify that running without the --function-signature flag does not remove +## the -SAME: lines, since the generated file will have --function-signature in +## a UTC_ARGS: comment in the first line (from the invocation above), which is +## added to the update invocation below. +# RUN: %update_test_checks %t.ll +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.expected +## Also try the --check-globals flag +# RUN: %update_test_checks %t.ll --check-globals +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll --function-signature --check-globals all +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.globals.expected +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll --check-globals none +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.noglobals.expected +# RUN: cp -f %S/Inputs/various_ir_values_dbgrecords.ll %t.ll && %update_test_checks %t.ll --check-globals smart +# RUN: diff -u %t.ll %S/Inputs/various_ir_values_dbgrecords.ll.funcsig.transitiveglobals.expected diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s index 73eadc268bf26..f703392f3e9d0 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-neon-instructions.s @@ -1070,14 +1070,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 4 0.50 abs d29, d24 -# CHECK-NEXT: 1 4 0.50 abs v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 abs v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 abs v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 abs v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 abs v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 abs v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 abs v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 abs d29, d24 +# CHECK-NEXT: 1 3 0.50 abs v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 abs v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 abs v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 abs v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 abs v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 abs v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 abs v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 add d17, d31, d29 # CHECK-NEXT: 1 3 0.50 add v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 addhn v0.2s, v0.2d, v0.2d @@ -1086,8 +1086,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 addhn2 v0.16b, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 addhn2 v0.4s, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 addhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 addp v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 addp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 addp v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 addp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 and v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 bic v0.4h, #15, lsl #8 # CHECK-NEXT: 1 3 0.50 bic v0.8b, v0.8b, v0.8b @@ -1441,13 +1441,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 mvni v0.2s, #0 # CHECK-NEXT: 1 3 0.50 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 3 0.50 neg d29, d24 -# CHECK-NEXT: 1 4 0.50 neg v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 neg v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 neg v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 neg v0.2d, v0.2d # CHECK-NEXT: 1 3 0.50 neg v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 neg v0.4h, v0.4h
-# CHECK-NEXT: 1 4 0.50 neg v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 neg v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 neg v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 neg v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 neg v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 mvn v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 mvn v0.8b, v0.8b # CHECK-NEXT: 1 3 0.50 orn v0.16b, v0.16b, v0.16b @@ -1457,12 +1457,12 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 pmul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 pmull v0.8h, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 pmull2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 raddhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 raddhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 raddhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 raddhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 raddhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 raddhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 raddhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 raddhn2 v0.8h, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 rbit v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 rbit v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 rev16 v21.8b, v1.8b @@ -1483,19 +1483,19 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 rshrn2 v0.16b, v0.8h, #3 # CHECK-NEXT: 1 4 0.50 rshrn2 v0.4s, v0.2d, #3 # CHECK-NEXT: 1 4 0.50 rshrn2 v0.8h, v0.4s, #3 -# CHECK-NEXT: 1 4 0.50 rsubhn v0.2s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 rsubhn v0.4h, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 rsubhn v0.8b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.16b, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.4s, v0.2d, v0.2d -# CHECK-NEXT: 1 4 0.50 rsubhn2 v0.8h, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 saba v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 8 0.50 sabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 8 0.50 sabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 8 0.50 sabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 sabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 sabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 sabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 8 0.50 rsubhn v0.2s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 rsubhn v0.4h, v0.4s, v0.4s +# CHECK-NEXT: 1 8 0.50 rsubhn v0.8b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.16b, v0.8h, v0.8h +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.4s, v0.2d, v0.2d +# CHECK-NEXT: 1 8 0.50 rsubhn2 v0.8h, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 saba v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 6 0.50 sabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 6 0.50 sabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 6 0.50 sabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 sabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 sabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 6 0.50 sabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 sabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 3 0.50 sabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 sabdl v0.4s, v0.4h, v0.4h @@ -1503,30 +1503,30 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 8 1.00 sadalp v0.1d, v0.2s -# CHECK-NEXT: 1 8 1.00 sadalp v0.2d, v0.4s -# CHECK-NEXT: 1 8 1.00 sadalp v0.2s, v0.4h -# CHECK-NEXT: 1 8 1.00 sadalp v0.4h, v0.8b -# CHECK-NEXT: 1 8 1.00 sadalp v0.4s, v0.8h -# CHECK-NEXT: 1 8 1.00 sadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 saddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 saddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 saddl v0.8h, 
v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 saddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 saddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 saddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 saddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 0.50 saddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 saddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 saddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 saddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 saddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 saddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 saddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 saddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 7 1.00 sadalp v0.1d, v0.2s +# CHECK-NEXT: 1 7 1.00 sadalp v0.2d, v0.4s +# CHECK-NEXT: 1 7 1.00 sadalp v0.2s, v0.4h +# CHECK-NEXT: 1 7 1.00 sadalp v0.4h, v0.8b +# CHECK-NEXT: 1 7 1.00 sadalp v0.4s, v0.8h +# CHECK-NEXT: 1 7 1.00 sadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 saddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 saddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 saddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 saddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 saddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 saddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 saddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 saddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 saddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 saddw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 saddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 saddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 scvtf d21, d12 # CHECK-NEXT: 1 4 0.50 scvtf d21, d12, #64 # CHECK-NEXT: 1 4 0.50 scvtf s22, s13 @@ -1573,18 +1573,18 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sli v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 sli v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 smax v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smax v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smax v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smaxp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 smaxp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 smaxp v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 smin v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smax v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smax v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smax v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smaxp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 smaxp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 smaxp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 smin v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 smin v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 smin v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 sminp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 smin v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 sminp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 sminp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 sminp v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 smlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 smlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 smlal v0.8h, v0.8b, v0.8b @@ -1777,14 +1777,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 srshr v0.4s, v0.4s, #3 # CHECK-NEXT: 1 3 0.50 srshr v0.8b, 
v0.8b, #3 # CHECK-NEXT: 1 3 0.50 srshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 8 1.00 srsra d15, d11, #19 -# CHECK-NEXT: 1 8 1.00 srsra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 srsra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 7 1.00 srsra d15, d11, #19 +# CHECK-NEXT: 1 7 1.00 srsra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 7 1.00 srsra v0.8h, v0.8h, #3 # CHECK-NEXT: 1 3 0.50 sshl d31, d31, d31 # CHECK-NEXT: 1 3 0.50 sshl v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 3 0.50 sshl v0.2s, v0.2s, v0.2s @@ -1800,26 +1800,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 sshr v0.4s, v0.4s, #3 # CHECK-NEXT: 1 3 0.50 sshr v0.8b, v0.8b, #3 # CHECK-NEXT: 1 3 0.50 sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 8 1.00 ssra d18, d12, #21 -# CHECK-NEXT: 1 8 1.00 ssra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 ssra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 ssubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 ssubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 ssubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 ssubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 ssubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 ssubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 ssubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 ssubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 ssra d18, d12, #21 +# CHECK-NEXT: 1 3 0.50 ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 ssubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 ssubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 ssubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 ssubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 ssubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 1.00 * st1 { v0.16b }, [x0] # CHECK-NEXT: 2 5 2.00 * st1 { v0.2d, v1.2d, v2.2d }, [x0], #48 # CHECK-NEXT: 1 5 4.00 * st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x0] @@ -1843,7 +1843,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0] # CHECK-NEXT: 2 5 2.00 * st4 { v0.b, v1.b, v2.b, v3.b }[9], [x0], x5 # CHECK-NEXT: 1 3 0.50 sub d15, d5, d16 -# CHECK-NEXT: 1 4 0.50 sub v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1 3 0.50 sub 
v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 suqadd b19, b14 # CHECK-NEXT: 1 4 0.50 suqadd d18, d22 # CHECK-NEXT: 1 4 0.50 suqadd h20, h15 @@ -1885,13 +1885,13 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 trn2 v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 trn2 v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 trn2 v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 uaba v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 uabal v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 8 0.50 uabal v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 8 0.50 uabal v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 8 0.50 uabal2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 8 0.50 uabal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 8 0.50 uabal2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 6 0.50 uaba v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 uabal v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 6 0.50 uabal v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 6 0.50 uabal v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 6 0.50 uabal2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 6 0.50 uabal2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 6 0.50 uabal2 v0.8h, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 uabd v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 3 0.50 uabdl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 3 0.50 uabdl v0.4s, v0.4h, v0.4h @@ -1899,30 +1899,30 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 uabdl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 uabdl2 v0.4s, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 uabdl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 8 1.00 uadalp v0.1d, v0.2s -# CHECK-NEXT: 1 8 1.00 uadalp v0.2d, v0.4s -# CHECK-NEXT: 1 8 1.00 uadalp v0.2s, v0.4h -# CHECK-NEXT: 1 8 1.00 uadalp v0.4h, v0.8b -# CHECK-NEXT: 1 8 1.00 uadalp v0.4s, v0.8h -# CHECK-NEXT: 1 8 1.00 uadalp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.1d, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddlp v0.2s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4h, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddlp v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddlp v0.8h, v0.16b -# CHECK-NEXT: 1 4 0.50 uaddw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 uaddw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 uaddw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 uaddw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 7 1.00 uadalp v0.1d, v0.2s +# CHECK-NEXT: 1 7 1.00 uadalp v0.2d, v0.4s +# CHECK-NEXT: 1 7 1.00 uadalp v0.2s, v0.4h +# CHECK-NEXT: 1 7 1.00 uadalp v0.4h, v0.8b +# CHECK-NEXT: 1 7 1.00 uadalp v0.4s, v0.8h +# CHECK-NEXT: 1 7 1.00 uadalp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.1d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 uaddlp v0.2s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddlp v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddlp v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 uaddw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 uaddw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 uaddw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.2d, v0.2d, 
v0.4s +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 uaddw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14 # CHECK-NEXT: 1 4 0.50 ucvtf d21, d14, #64 # CHECK-NEXT: 1 4 0.50 ucvtf s22, s13 @@ -1935,21 +1935,21 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 uhadd v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 uhsub v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umax v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 uhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 uhsub v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 umax v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 umax v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umax v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umaxp v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 umax v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umaxp v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 umaxp v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 umaxp v0.8h, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 umin v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 umin v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 umin v0.8b, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 uminp v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 uminp v0.4h, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 uminp v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 umaxp v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 umin v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 umin v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 umin v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 uminp v0.2s, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 uminp v0.4h, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 uminp v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 umlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 umlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 umlal v0.8h, v0.8b, v0.8b @@ -2024,9 +2024,9 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 uqxtn2 v0.8h, v0.4s # CHECK-NEXT: 1 4 0.50 urecpe v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 urecpe v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 urhadd v0.16b, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 urhadd v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 urhadd v0.8h, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 urhadd v0.16b, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 urhadd v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 urhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 urshl d8, d7, d4 # CHECK-NEXT: 1 3 0.50 urshl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 urshl v0.2d, v0.2d, v0.2d @@ -2042,14 +2042,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 3 0.50 urshr v0.8h, v0.8h, #3 # CHECK-NEXT: 1 12 9.00 ursqrte v0.2s, v0.2s # CHECK-NEXT: 1 12 9.00 ursqrte v0.4s, v0.4s -# CHECK-NEXT: 1 8 1.00 ursra d18, d10, #13 -# CHECK-NEXT: 1 8 1.00 ursra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 ursra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 7 1.00 ursra d18, d10, #13 +# CHECK-NEXT: 1 7 1.00 ursra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 7 1.00 ursra v0.8h, v0.8h, #3 # CHECK-NEXT: 1 3 0.50 ushl d0, 
d0, d0 # CHECK-NEXT: 1 3 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 3 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2075,26 +2075,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 usqadd v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 usqadd v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 usqadd v0.8h, v0.8h -# CHECK-NEXT: 1 8 1.00 usra d20, d13, #61 -# CHECK-NEXT: 1 8 1.00 usra v0.16b, v0.16b, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.2d, v0.2d, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.2s, v0.2s, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.4h, v0.4h, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.4s, v0.4s, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.8b, v0.8b, #3 -# CHECK-NEXT: 1 8 1.00 usra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 4 0.50 usubl v0.2d, v0.2s, v0.2s -# CHECK-NEXT: 1 4 0.50 usubl v0.4s, v0.4h, v0.4h -# CHECK-NEXT: 1 4 0.50 usubl v0.8h, v0.8b, v0.8b -# CHECK-NEXT: 1 4 0.50 usubl2 v0.2d, v0.4s, v0.4s -# CHECK-NEXT: 1 4 0.50 usubl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 4 0.50 usubl2 v0.8h, v0.16b, v0.16b -# CHECK-NEXT: 1 4 0.50 usubw v0.2d, v0.2d, v0.2s -# CHECK-NEXT: 1 4 0.50 usubw v0.4s, v0.4s, v0.4h -# CHECK-NEXT: 1 4 0.50 usubw v0.8h, v0.8h, v0.8b -# CHECK-NEXT: 1 4 0.50 usubw2 v0.2d, v0.2d, v0.4s -# CHECK-NEXT: 1 4 0.50 usubw2 v0.4s, v0.4s, v0.8h -# CHECK-NEXT: 1 4 0.50 usubw2 v0.8h, v0.8h, v0.16b +# CHECK-NEXT: 1 3 0.50 usra d20, d13, #61 +# CHECK-NEXT: 1 3 0.50 usra v0.16b, v0.16b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2d, v0.2d, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.2s, v0.2s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4h, v0.4h, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.4s, v0.4s, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8b, v0.8b, #3 +# CHECK-NEXT: 1 3 0.50 usra v0.8h, v0.8h, #3 +# CHECK-NEXT: 1 3 0.50 usubl v0.2d, v0.2s, v0.2s +# CHECK-NEXT: 1 3 0.50 usubl v0.4s, v0.4h, v0.4h +# CHECK-NEXT: 1 3 0.50 usubl v0.8h, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.50 usubl2 v0.2d, v0.4s, v0.4s +# CHECK-NEXT: 1 3 0.50 usubl2 v0.4s, v0.8h, v0.8h +# CHECK-NEXT: 1 3 0.50 usubl2 v0.8h, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.50 usubw v0.2d, v0.2d, v0.2s +# CHECK-NEXT: 1 3 0.50 usubw v0.4s, v0.4s, v0.4h +# CHECK-NEXT: 1 3 0.50 usubw v0.8h, v0.8h, v0.8b +# CHECK-NEXT: 1 3 0.50 usubw2 v0.2d, v0.2d, v0.4s +# CHECK-NEXT: 1 3 0.50 usubw2 v0.4s, v0.4s, v0.8h +# CHECK-NEXT: 1 3 0.50 usubw2 v0.8h, v0.8h, v0.16b # CHECK-NEXT: 1 4 0.50 uzp1 v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 4 0.50 uzp1 v0.2d, v0.2d, v0.2d # CHECK-NEXT: 1 4 0.50 uzp1 v0.2s, v0.2s, v0.2s @@ -2148,7 +2148,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] -# CHECK-NEXT: - - - - - 39.00 91.00 - - 509.00 509.00 3.00 3.00 197.00 +# CHECK-NEXT: - - - - - 39.00 91.00 - - 501.00 501.00 3.00 3.00 197.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] Instructions: @@ -2882,14 +2882,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.4s, v0.4s, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.8b, v0.8b, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - sshr v0.8h, v0.8h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra d18, d12, #21 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - - - - - - 
1.00 1.00 - - - ssra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - ssra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra d18, d12, #21 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - ssubl v0.8h, v0.8b, v0.8b @@ -3157,14 +3157,14 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usqadd v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra d20, d13, #61 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.16b, v0.16b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.2d, v0.2d, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.2s, v0.2s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.4h, v0.4h, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.4s, v0.4s, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.8b, v0.8b, #3 -# CHECK-NEXT: - - - - - - - - - 1.00 1.00 - - - usra v0.8h, v0.8h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra d20, d13, #61 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.16b, v0.16b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.2d, v0.2d, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.2s, v0.2s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.4h, v0.4h, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.4s, v0.4s, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.8b, v0.8b, #3 +# CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usra v0.8h, v0.8h, #3 # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - 0.50 0.50 - - - usubl v0.8h, v0.8b, v0.8b diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s index a8fb8b669838f..d8051e7ecb4fe 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A510-sve-instructions.s @@ -3476,12 +3476,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 add z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, #65280 # CHECK-NEXT: 1 3 0.50 add z31.s, z31.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnb z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 addhnb z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnb z0.s, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 addhnt z0.b, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 addhnt z0.h, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 addhnt z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 addhnb z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 addhnb z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 addhnb z0.s, z1.d, z31.d +# CHECK-NEXT: 1 8 0.50 addhnt z0.b, z1.h, z31.h +# CHECK-NEXT: 1 8 0.50 addhnt 
z0.h, z1.s, z31.s +# CHECK-NEXT: 1 8 0.50 addhnt z0.s, z1.d, z31.d # CHECK-NEXT: 1 3 0.50 addp z0.b, p0/m, z0.b, z1.b # CHECK-NEXT: 1 3 0.50 addp z0.h, p0/m, z0.h, z1.h # CHECK-NEXT: 1 3 0.50 addp z29.s, p7/m, z29.s, z30.s @@ -3516,7 +3516,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 aesimc z31.b, z31.b # CHECK-NEXT: 1 3 0.50 aesmc z0.b, z0.b # CHECK-NEXT: 1 3 0.50 aesmc z31.b, z31.b -# CHECK-NEXT: 1 6 1.00 and p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 and p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 and z0.d, z0.d, z0.d @@ -3531,7 +3531,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 and z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 and z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 ands p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 ands p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 1.00 andv b0, p7, z31.b # CHECK-NEXT: 1 4 1.00 andv d0, p7, z31.d # CHECK-NEXT: 1 4 1.00 andv h0, p7, z31.h @@ -3574,7 +3574,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 asrr z0.d, p0/m, z0.d, z0.d # CHECK-NEXT: 1 3 0.50 asrr z0.h, p0/m, z0.h, z0.h # CHECK-NEXT: 1 3 0.50 asrr z0.s, p0/m, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 bcax z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 bcax z29.d, z29.d, z30.d, z31.d # CHECK-NEXT: 1 14 13.00 bdep z0.b, z1.b, z31.b # CHECK-NEXT: 1 70 69.00 bdep z0.d, z1.d, z31.d # CHECK-NEXT: 1 22 21.00 bdep z0.h, z1.h, z31.h @@ -3603,34 +3603,34 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 70 69.00 bgrp z0.d, z1.d, z31.d # CHECK-NEXT: 1 22 21.00 bgrp z0.h, z1.h, z31.h # CHECK-NEXT: 1 38 37.00 bgrp z0.s, z1.s, z31.s -# CHECK-NEXT: 1 6 1.00 bic p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 bic p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 bic p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bic p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 bic z0.d, z0.d, z0.d # CHECK-NEXT: 1 3 0.50 bic z23.d, z13.d, z8.d # CHECK-NEXT: 1 3 0.50 bic z31.b, p7/m, z31.b, z31.b # CHECK-NEXT: 1 3 0.50 bic z31.d, p7/m, z31.d, z31.d # CHECK-NEXT: 1 3 0.50 bic z31.h, p7/m, z31.h, z31.h # CHECK-NEXT: 1 3 0.50 bic z31.s, p7/m, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 bics p0.b, p0/z, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 bics p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brka p0.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 brka p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkas p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkb p0.b, p15/m, p15.b -# CHECK-NEXT: 1 6 1.00 brkb p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkbs p0.b, p15/z, p15.b -# CHECK-NEXT: 1 6 1.00 brkn p0.b, p15/z, p1.b, p0.b -# CHECK-NEXT: 1 6 1.00 brkn p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkns p0.b, p15/z, p1.b, p0.b -# CHECK-NEXT: 1 6 1.00 brkns p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpa p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpa p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpas p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpas p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpb p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpb p15.b, p15/z, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 brkpbs p0.b, p15/z, p1.b, p2.b -# CHECK-NEXT: 1 6 1.00 brkpbs p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 bics p0.b, p0/z, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 bics p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brka p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkas p0.b, p15/z, p15.b +# 
CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/m, p15.b +# CHECK-NEXT: 1 2 1.00 brkb p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkbs p0.b, p15/z, p15.b +# CHECK-NEXT: 1 2 1.00 brkn p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkn p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkns p0.b, p15/z, p1.b, p0.b +# CHECK-NEXT: 1 2 1.00 brkns p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpa p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpa p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 4 1.00 brkpas p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 4 1.00 brkpas p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 brkpb p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 2 1.00 brkpb p15.b, p15/z, p15.b, p15.b +# CHECK-NEXT: 1 4 1.00 brkpbs p0.b, p15/z, p1.b, p2.b +# CHECK-NEXT: 1 4 1.00 brkpbs p15.b, p15/z, p15.b, p15.b # CHECK-NEXT: 1 3 0.50 bsl z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 bsl1n z0.d, z0.d, z1.d, z2.d # CHECK-NEXT: 1 3 0.50 bsl2n z0.d, z0.d, z1.d, z2.d @@ -3704,163 +3704,163 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 cmla z31.h, z31.h, z31.h, #180 # CHECK-NEXT: 1 4 0.50 cmla z31.s, z30.s, z7.s[0], #180 # CHECK-NEXT: 1 4 0.50 cmla z31.s, z31.s, z31.s, #180 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpeq p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpeq p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpge p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpge p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, #15 
-# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpgt p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmpgt p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphi p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphi p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.b, p0/z, z1.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.d, p0/z, z1.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphs p0.h, p0/z, z1.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z0.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmphs p0.s, p0/z, z1.s, z0.s -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmple p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmple p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmplo p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmplo p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmplo 
p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.d, p0/z, z0.d, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.d, p0/z, z0.d, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, #0 -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, #127 -# CHECK-NEXT: 1 3 0.50 cmpls p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmplt p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, z0.b -# CHECK-NEXT: 1 3 0.50 cmpne p0.b, p0/z, z0.b, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.d, p0/z, z0.d, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.h, p0/z, z0.h, z0.h -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, #-16 -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, #15 -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, z0.d -# CHECK-NEXT: 1 3 0.50 cmpne p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpeq p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpeq p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpge p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, #-16 +# 
CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpge p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpgt p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmpgt p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphi p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphi p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.b, p0/z, z1.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.d, p0/z, z1.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphs p0.h, p0/z, z1.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z0.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmphs p0.s, p0/z, z1.s, z0.s +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmple p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmple 
p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmple p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmplo p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.d, p0/z, z0.d, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.d, p0/z, z0.d, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, #0 +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, #127 +# CHECK-NEXT: 1 4 0.50 cmpls p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmplt p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, z0.b +# CHECK-NEXT: 1 4 0.50 cmpne p0.b, p0/z, z0.b, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.d, p0/z, z0.d, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.h, p0/z, z0.h, z0.h +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, #-16 +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, #15 +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, z0.d +# CHECK-NEXT: 1 4 0.50 cmpne p0.s, p0/z, z0.s, z0.s # CHECK-NEXT: 1 3 0.50 cnot z31.b, p7/m, z31.b # CHECK-NEXT: 1 3 0.50 cnot z31.d, p7/m, z31.d # CHECK-NEXT: 1 3 0.50 cnot z31.h, p7/m, z31.h # CHECK-NEXT: 1 3 0.50 cnot z31.s, p7/m, z31.s -# CHECK-NEXT: 1 4 0.50 cnt z31.b, p7/m, z31.b +# CHECK-NEXT: 1 3 0.50 cnt z31.b, p7/m, z31.b # CHECK-NEXT: 1 12 0.50 cnt z31.d, p7/m, z31.d -# CHECK-NEXT: 1 4 0.50 cnt z31.h, p7/m, z31.h +# CHECK-NEXT: 1 3 0.50 cnt z31.h, p7/m, z31.h # CHECK-NEXT: 1 8 0.50 cnt z31.s, p7/m, z31.s # CHECK-NEXT: 1 1 0.33 cntb x0 # CHECK-NEXT: 1 1 0.33 cntb x0, #28 @@ -3874,10 +3874,10 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 cnth x0, #28 # CHECK-NEXT: 1 1 0.33 cnth x0, all, mul #16 # CHECK-NEXT: 1 1 0.33 cnth x0, pow2 -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.b -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.d -# 
CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.h -# CHECK-NEXT: 1 6 1.00 cntp x0, p15, p0.s +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.b +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.d +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.h +# CHECK-NEXT: 1 4 1.00 cntp x0, p15, p0.s # CHECK-NEXT: 1 1 0.33 cntw x0 # CHECK-NEXT: 1 1 0.33 cntw x0, #28 # CHECK-NEXT: 1 1 0.33 cntw x0, all, mul #16 @@ -3892,42 +3892,42 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 1 0.33 ctermne wzr, w30 # CHECK-NEXT: 1 1 0.33 ctermne x30, xzr # CHECK-NEXT: 1 1 0.33 ctermne xzr, x30 -# CHECK-NEXT: 1 1 0.33 decb x0 -# CHECK-NEXT: 1 1 0.33 decb x0, #14 -# CHECK-NEXT: 1 1 0.33 decb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decb x0, pow2 -# CHECK-NEXT: 1 1 0.33 decb x0, vl1 -# CHECK-NEXT: 1 1 0.33 decd x0 -# CHECK-NEXT: 1 1 0.33 decd x0, #14 -# CHECK-NEXT: 1 1 0.33 decd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decd x0, pow2 -# CHECK-NEXT: 1 1 0.33 decd x0, vl1 -# CHECK-NEXT: 1 1 0.33 dech x0 -# CHECK-NEXT: 1 1 0.33 dech x0, #14 -# CHECK-NEXT: 1 1 0.33 dech x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 dech x0, pow2 -# CHECK-NEXT: 1 1 0.33 dech x0, vl1 -# CHECK-NEXT: 1 6 1.00 decp x0, p0.b -# CHECK-NEXT: 1 6 1.00 decp x0, p0.d -# CHECK-NEXT: 1 6 1.00 decp x0, p0.h -# CHECK-NEXT: 1 6 1.00 decp x0, p0.s -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.b -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.d -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.h -# CHECK-NEXT: 1 6 1.00 decp xzr, p15.s +# CHECK-NEXT: 1 3 0.33 decb x0 +# CHECK-NEXT: 1 3 0.33 decb x0, #14 +# CHECK-NEXT: 1 3 0.33 decb x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decb x0, pow2 +# CHECK-NEXT: 1 3 0.33 decb x0, vl1 +# CHECK-NEXT: 1 3 0.33 decd x0 +# CHECK-NEXT: 1 3 0.33 decd x0, #14 +# CHECK-NEXT: 1 3 0.33 decd x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decd x0, pow2 +# CHECK-NEXT: 1 3 0.33 decd x0, vl1 +# CHECK-NEXT: 1 3 0.33 dech x0 +# CHECK-NEXT: 1 3 0.33 dech x0, #14 +# CHECK-NEXT: 1 3 0.33 dech x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 dech x0, pow2 +# CHECK-NEXT: 1 3 0.33 dech x0, vl1 +# CHECK-NEXT: 1 4 1.00 decp x0, p0.b +# CHECK-NEXT: 1 4 1.00 decp x0, p0.d +# CHECK-NEXT: 1 4 1.00 decp x0, p0.h +# CHECK-NEXT: 1 4 1.00 decp x0, p0.s +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.b +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.d +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.h +# CHECK-NEXT: 1 4 1.00 decp xzr, p15.s # CHECK-NEXT: 1 4 0.50 decp z31.d, p15.d # CHECK-NEXT: 1 4 0.50 decp z31.h, p15.h # CHECK-NEXT: 1 4 0.50 decp z31.s, p15.s -# CHECK-NEXT: 1 1 0.33 decw x0 -# CHECK-NEXT: 1 1 0.33 decw x0, #14 -# CHECK-NEXT: 1 1 0.33 decw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 decw x0, pow2 -# CHECK-NEXT: 1 1 0.33 decw x0, vl1 +# CHECK-NEXT: 1 3 0.33 decw x0 +# CHECK-NEXT: 1 3 0.33 decw x0, #14 +# CHECK-NEXT: 1 3 0.33 decw x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 decw x0, pow2 +# CHECK-NEXT: 1 3 0.33 decw x0, vl1 # CHECK-NEXT: 1 4 0.50 dupm z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 4 0.50 dupm z0.s, #0xfffffff9 # CHECK-NEXT: 1 4 0.50 dupm z23.h, #0xfff9 # CHECK-NEXT: 1 4 0.50 dupm z5.b, #0xf9 -# CHECK-NEXT: 1 6 1.00 eor p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 eor p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0x6 # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, #0xfffffffffffffff9 # CHECK-NEXT: 1 3 0.50 eor z0.d, z0.d, z0.d @@ -3942,12 +3942,12 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 eor z31.s, p7/m, z31.s, z31.s # CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0x6 # CHECK-NEXT: 1 3 0.50 eor z5.b, z5.b, #0xf9 -# CHECK-NEXT: 1 3 0.50 eor3 z29.d, z29.d, z30.d, z31.d +# CHECK-NEXT: 1 4 0.50 eor3 z29.d, z29.d, z30.d, z31.d # CHECK-NEXT: 1 
4 0.50 eorbt z0.b, z1.b, z31.b # CHECK-NEXT: 1 4 0.50 eorbt z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 eorbt z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 0.50 eorbt z0.s, z1.s, z31.s -# CHECK-NEXT: 1 6 1.00 eors p0.b, p0/z, p0.b, p1.b +# CHECK-NEXT: 1 2 1.00 eors p0.b, p0/z, p0.b, p1.b # CHECK-NEXT: 1 4 0.50 eortb z0.b, z1.b, z31.b # CHECK-NEXT: 1 4 0.50 eortb z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 eortb z0.h, z1.h, z31.h @@ -4303,49 +4303,49 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: 1 4 0.50 ftsmul z0.d, z1.d, z31.d # CHECK-NEXT: 1 4 0.50 ftsmul z0.h, z1.h, z31.h # CHECK-NEXT: 1 4 0.50 ftsmul z0.s, z1.s, z31.s -# CHECK-NEXT: 1 4 0.50 ftssel z0.d, z1.d, z31.d -# CHECK-NEXT: 1 4 0.50 ftssel z0.h, z1.h, z31.h -# CHECK-NEXT: 1 4 0.50 ftssel z0.s, z1.s, z31.s +# CHECK-NEXT: 1 3 0.50 ftssel z0.d, z1.d, z31.d +# CHECK-NEXT: 1 3 0.50 ftssel z0.h, z1.h, z31.h +# CHECK-NEXT: 1 3 0.50 ftssel z0.s, z1.s, z31.s # CHECK-NEXT: 1 8 2.00 histcnt z0.s, p0/z, z1.s, z2.s # CHECK-NEXT: 1 8 2.00 histcnt z29.d, p7/z, z30.d, z31.d # CHECK-NEXT: 1 8 2.00 histseg z0.b, z1.b, z31.b -# CHECK-NEXT: 1 1 0.33 incb x0 -# CHECK-NEXT: 1 1 0.33 incb x0, #14 -# CHECK-NEXT: 1 1 0.33 incb x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incb x0, pow2 -# CHECK-NEXT: 1 1 0.33 incb x0, vl1 -# CHECK-NEXT: 1 1 0.33 incd x0 -# CHECK-NEXT: 1 1 0.33 incd x0, #14 -# CHECK-NEXT: 1 1 0.33 incd x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incd x0, pow2 -# CHECK-NEXT: 1 1 0.33 incd x0, vl1 -# CHECK-NEXT: 1 4 0.50 incd z0.d -# CHECK-NEXT: 1 4 0.50 incd z0.d, all, mul #16 -# CHECK-NEXT: 1 1 0.33 inch x0 -# CHECK-NEXT: 1 1 0.33 inch x0, #14 -# CHECK-NEXT: 1 1 0.33 inch x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 inch x0, pow2 -# CHECK-NEXT: 1 1 0.33 inch x0, vl1 -# CHECK-NEXT: 1 4 0.50 inch z0.h -# CHECK-NEXT: 1 4 0.50 inch z0.h, all, mul #16 -# CHECK-NEXT: 1 6 1.00 incp x0, p0.b -# CHECK-NEXT: 1 6 1.00 incp x0, p0.d -# CHECK-NEXT: 1 6 1.00 incp x0, p0.h -# CHECK-NEXT: 1 6 1.00 incp x0, p0.s -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.b -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.d -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.h -# CHECK-NEXT: 1 6 1.00 incp xzr, p15.s +# CHECK-NEXT: 1 3 0.33 incb x0 +# CHECK-NEXT: 1 3 0.33 incb x0, #14 +# CHECK-NEXT: 1 3 0.33 incb x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incb x0, pow2 +# CHECK-NEXT: 1 3 0.33 incb x0, vl1 +# CHECK-NEXT: 1 3 0.33 incd x0 +# CHECK-NEXT: 1 3 0.33 incd x0, #14 +# CHECK-NEXT: 1 3 0.33 incd x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 incd x0, pow2 +# CHECK-NEXT: 1 3 0.33 incd x0, vl1 +# CHECK-NEXT: 1 3 0.50 incd z0.d +# CHECK-NEXT: 1 3 0.50 incd z0.d, all, mul #16 +# CHECK-NEXT: 1 3 0.33 inch x0 +# CHECK-NEXT: 1 3 0.33 inch x0, #14 +# CHECK-NEXT: 1 3 0.33 inch x0, all, mul #16 +# CHECK-NEXT: 1 3 0.33 inch x0, pow2 +# CHECK-NEXT: 1 3 0.33 inch x0, vl1 +# CHECK-NEXT: 1 3 0.50 inch z0.h +# CHECK-NEXT: 1 3 0.50 inch z0.h, all, mul #16 +# CHECK-NEXT: 1 4 1.00 incp x0, p0.b +# CHECK-NEXT: 1 4 1.00 incp x0, p0.d +# CHECK-NEXT: 1 4 1.00 incp x0, p0.h +# CHECK-NEXT: 1 4 1.00 incp x0, p0.s +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.b +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.d +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.h +# CHECK-NEXT: 1 4 1.00 incp xzr, p15.s # CHECK-NEXT: 1 4 0.50 incp z31.d, p15.d # CHECK-NEXT: 1 4 0.50 incp z31.h, p15.h # CHECK-NEXT: 1 4 0.50 incp z31.s, p15.s -# CHECK-NEXT: 1 1 0.33 incw x0 -# CHECK-NEXT: 1 1 0.33 incw x0, #14 -# CHECK-NEXT: 1 1 0.33 incw x0, all, mul #16 -# CHECK-NEXT: 1 1 0.33 incw x0, pow2 -# CHECK-NEXT: 1 1 0.33 incw x0, vl1 -# CHECK-NEXT: 1 4 0.50 incw z0.s -# CHECK-NEXT: 1 4 0.50 incw z0.s, all, mul #16 +# 
+# CHECK-NEXT: 1 3 0.33 incw x0
+# CHECK-NEXT: 1 3 0.33 incw x0, #14
+# CHECK-NEXT: 1 3 0.33 incw x0, all, mul #16
+# CHECK-NEXT: 1 3 0.33 incw x0, pow2
+# CHECK-NEXT: 1 3 0.33 incw x0, vl1
+# CHECK-NEXT: 1 3 0.50 incw z0.s
+# CHECK-NEXT: 1 3 0.50 incw z0.s, all, mul #16
# CHECK-NEXT: 1 4 0.50 index z0.b, #0, #0
# CHECK-NEXT: 1 4 0.50 index z0.d, #0, #0
# CHECK-NEXT: 1 4 0.50 index z0.h, #0, #0
@@ -4412,8 +4412,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 * ld1b { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 7 7.00 * ld1b { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * ld1b { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 9 4.50 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 3 0.50 * ld1b { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 9 9.00 * ld1b { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 3 0.50 * ld1b { z21.b }, p5/z, [x10, #5, mul vl]
@@ -4450,8 +4450,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 * ld1h { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 7 7.00 * ld1h { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * ld1h { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 9 4.50 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 3 0.50 * ld1h { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 9 9.00 * ld1h { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 3 0.50 * ld1h { z21.d }, p5/z, [x10, #5, mul vl]
@@ -4467,8 +4467,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * ld1h { z31.d }, p7/z, [z31.d, #62]
# CHECK-NEXT: 1 3 0.50 * ld1h { z31.h }, p7/z, [sp, #-1, mul vl]
# CHECK-NEXT: 1 3 0.50 * ld1h { z31.s }, p7/z, [sp, #-1, mul vl]
-# CHECK-NEXT: 1 9 4.50 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: 1 9 4.50 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: 1 7 3.50 * ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: 1 7 3.50 * ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: 1 9 9.00 * ld1h { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: 1 3 0.50 * ld1h { z5.h }, p3/z, [sp, x16, lsl #1]
# CHECK-NEXT: 1 3 0.50 * ld1h { z5.h }, p3/z, [x17, x16, lsl #1]
@@ -4529,7 +4529,7 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [sp, x0]
# CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [x0, x0]
# CHECK-NEXT: 1 3 0.50 * ld1sb { z0.h }, p0/z, [x0]
-# CHECK-NEXT: 1 9 4.50 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
# CHECK-NEXT: 1 3 0.50 * ld1sb { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 9 9.00 * ld1sb { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 3 0.50 * ld1sb { z21.d }, p5/z, [x10, #5, mul vl]
@@ -4549,8 +4549,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
# CHECK-NEXT: 1 3 0.50 * ld1sh { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 7 7.00 * ld1sh { z0.d }, p0/z, [z0.d]
-# CHECK-NEXT: 1 9 4.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 3 0.50 * ld1sh { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 9 9.00 * ld1sh { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 3 0.50 * ld1sh { z21.d }, p5/z, [x10, #5, mul vl]
@@ -4565,8 +4565,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * ld1sh { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: 1 7 7.00 * ld1sh { z31.d }, p7/z, [z31.d, #62]
# CHECK-NEXT: 1 3 0.50 * ld1sh { z31.s }, p7/z, [sp, #-1, mul vl]
-# CHECK-NEXT: 1 9 4.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: 1 9 4.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: 1 7 3.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: 1 7 3.50 * ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: 1 9 9.00 * ld1sh { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: 1 7 7.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
# CHECK-NEXT: 1 7 7.00 * ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
@@ -4585,8 +4585,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
# CHECK-NEXT: 1 3 0.50 * ld1w { z0.d }, p0/z, [x0]
# CHECK-NEXT: 1 7 7.00 * ld1w { z0.d }, p0/z, [z0.d]
-# CHECK-NEXT: 1 9 4.50 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 3 0.50 * ld1w { z0.s }, p0/z, [x0]
# CHECK-NEXT: 1 9 9.00 * ld1w { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 3 0.50 * ld1w { z21.d }, p5/z, [x10, #5, mul vl]
@@ -4601,8 +4601,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * ld1w { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: 1 7 7.00 * ld1w { z31.d }, p7/z, [z31.d, #124]
# CHECK-NEXT: 1 3 0.50 * ld1w { z31.s }, p7/z, [sp, #-1, mul vl]
-# CHECK-NEXT: 1 9 4.50 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
-# CHECK-NEXT: 1 9 4.50 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT: 1 7 3.50 * ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT: 1 7 3.50 * ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
# CHECK-NEXT: 1 9 9.00 * ld1w { z31.s }, p7/z, [z31.s, #124]
# CHECK-NEXT: 1 3 2.00 * ld2b { z0.b, z1.b }, p0/z, [x0, x0]
# CHECK-NEXT: 1 3 1.00 * ld2b { z0.b, z1.b }, p0/z, [x0]
@@ -4668,8 +4668,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * U ldff1b { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1b { z0.h }, p0/z, [x0, x0]
# CHECK-NEXT: 1 3 0.50 * U ldff1b { z0.s }, p0/z, [x0, x0]
-# CHECK-NEXT: 1 9 4.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 9 9.00 * U ldff1b { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 7 7.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: 1 7 7.00 * U ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -4696,8 +4696,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * U ldff1h { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1h { z0.h }, p0/z, [x0, x0, lsl #1]
# CHECK-NEXT: 1 3 0.50 * U ldff1h { z0.s }, p0/z, [x0, x0, lsl #1]
-# CHECK-NEXT: 1 9 4.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 9 9.00 * U ldff1h { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 7 7.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: 1 7 7.00 * U ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -4706,16 +4706,16 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.d }, p7/z, [sp]
# CHECK-NEXT: 1 7 7.00 * U ldff1h { z31.d }, p7/z, [z31.d, #62]
# CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.h }, p7/z, [sp]
-# CHECK-NEXT: 1 9 4.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: 1 9 4.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: 1 7 3.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: 1 7 3.50 * U ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: 1 3 0.50 * U ldff1h { z31.s }, p7/z, [sp]
# CHECK-NEXT: 1 9 9.00 * U ldff1h { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.d }, p0/z, [x0, x0]
# CHECK-NEXT: 1 7 7.00 * U ldff1sb { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.h }, p0/z, [x0, x0]
# CHECK-NEXT: 1 3 0.50 * U ldff1sb { z0.s }, p0/z, [x0, x0]
-# CHECK-NEXT: 1 9 4.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 9 9.00 * U ldff1sb { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 7 7.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: 1 7 7.00 * U ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -4730,8 +4730,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1]
-# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 9 9.00 * U ldff1sh { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -4739,8 +4739,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1sh { z31.d }, p7/z, [sp]
# CHECK-NEXT: 1 7 7.00 * U ldff1sh { z31.d }, p7/z, [z31.d, #62]
-# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
-# CHECK-NEXT: 1 9 4.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
+# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1]
+# CHECK-NEXT: 1 7 3.50 * U ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1]
# CHECK-NEXT: 1 3 0.50 * U ldff1sh { z31.s }, p7/z, [sp]
# CHECK-NEXT: 1 9 9.00 * U ldff1sh { z31.s }, p7/z, [z31.s, #62]
# CHECK-NEXT: 1 3 0.50 * U ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2]
@@ -4758,8 +4758,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * U ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
# CHECK-NEXT: 1 7 7.00 * U ldff1w { z0.d }, p0/z, [z0.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1w { z0.s }, p0/z, [x0, x0, lsl #2]
-# CHECK-NEXT: 1 9 4.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
-# CHECK-NEXT: 1 9 4.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+# CHECK-NEXT: 1 7 3.50 * U ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw]
# CHECK-NEXT: 1 9 9.00 * U ldff1w { z0.s }, p0/z, [z0.s]
# CHECK-NEXT: 1 7 7.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw]
# CHECK-NEXT: 1 7 7.00 * U ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw]
@@ -4767,8 +4767,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.d }, p7/z, [sp, z31.d]
# CHECK-NEXT: 1 3 0.50 * U ldff1w { z31.d }, p7/z, [sp]
# CHECK-NEXT: 1 7 7.00 * U ldff1w { z31.d }, p7/z, [z31.d, #124]
-# CHECK-NEXT: 1 9 4.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
-# CHECK-NEXT: 1 9 4.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
+# CHECK-NEXT: 1 7 3.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2]
+# CHECK-NEXT: 1 7 3.50 * U ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2]
# CHECK-NEXT: 1 3 0.50 * U ldff1w { z31.s }, p7/z, [sp]
# CHECK-NEXT: 1 9 9.00 * U ldff1w { z31.s }, p7/z, [z31.s, #124]
# CHECK-NEXT: 1 3 0.50 * U ldnf1b { z0.b }, p0/z, [x0]
@@ -4959,12 +4959,12 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 mls z0.h, z1.h, z7.h[7]
# CHECK-NEXT: 1 4 0.50 mls z0.s, p7/m, z1.s, z31.s
# CHECK-NEXT: 1 4 0.50 mls z0.s, z1.s, z7.s[3]
-# CHECK-NEXT: 1 6 1.00 mov p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 mov p0.b, p0/m, p0.b
-# CHECK-NEXT: 1 6 1.00 mov p0.b, p0/z, p0.b
-# CHECK-NEXT: 1 6 1.00 mov p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 mov p15.b, p15/m, p15.b
-# CHECK-NEXT: 1 6 1.00 mov p15.b, p15/z, p15.b
+# CHECK-NEXT: 1 2 1.00 mov p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/m, p0.b
+# CHECK-NEXT: 1 2 1.00 mov p0.b, p0/z, p0.b
+# CHECK-NEXT: 1 2 1.00 mov p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/m, p15.b
+# CHECK-NEXT: 1 2 1.00 mov p15.b, p15/z, p15.b
# CHECK-NEXT: 1 3 0.50 mov z0.b, #127
# CHECK-NEXT: 1 3 0.50 mov z0.b, b0
# CHECK-NEXT: 1 3 0.50 mov z0.b, p0/m, b0
@@ -5062,10 +5062,10 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 mov z5.h, #-6
# CHECK-NEXT: 1 3 0.50 mov z5.q, z17.q[3]
# CHECK-NEXT: 1 3 0.50 mov z5.s, #-6
-# CHECK-NEXT: 1 6 1.00 movs p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 movs p0.b, p0/z, p0.b
-# CHECK-NEXT: 1 6 1.00 movs p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 movs p15.b, p15/z, p15.b
+# CHECK-NEXT: 1 2 1.00 movs p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 movs p0.b, p0/z, p0.b
+# CHECK-NEXT: 1 2 1.00 movs p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 movs p15.b, p15/z, p15.b
# CHECK-NEXT: 1 1 1.00 U mrs x3, ID_AA64ZFR0_EL1
# CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL1
# CHECK-NEXT: 1 1 1.00 U mrs x3, ZCR_EL12
@@ -5098,10 +5098,10 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 mul z31.h, z31.h, #127
# CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #-128
# CHECK-NEXT: 1 4 0.50 mul z31.s, z31.s, #127
-# CHECK-NEXT: 1 6 1.00 nand p0.b, p0/z, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 nand p15.b, p15/z, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 nands p0.b, p0/z, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 nands p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 nand p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 nand p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 nands p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 nands p15.b, p15/z, p15.b, p15.b
# CHECK-NEXT: 1 3 0.50 nbsl z0.d, z0.d, z1.d, z2.d
# CHECK-NEXT: 1 3 0.50 neg z0.b, p0/m, z0.b
# CHECK-NEXT: 1 3 0.50 neg z0.d, p0/m, z0.d
@@ -5115,23 +5115,23 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 1.00 nmatch p0.h, p0/z, z0.h, z0.h
# CHECK-NEXT: 1 7 1.00 nmatch p15.b, p7/z, z30.b, z31.b
# CHECK-NEXT: 1 7 1.00 nmatch p15.h, p7/z, z30.h, z31.h
-# CHECK-NEXT: 1 6 1.00 nor p0.b, p0/z, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 nor p15.b, p15/z, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 nors p0.b, p0/z, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 nors p15.b, p15/z, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 not p0.b, p0/z, p0.b
-# CHECK-NEXT: 1 6 1.00 not p15.b, p15/z, p15.b
+# CHECK-NEXT: 1 2 1.00 nor p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 nor p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 nors p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 nors p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 not p0.b, p0/z, p0.b
+# CHECK-NEXT: 1 2 1.00 not p15.b, p15/z, p15.b
# CHECK-NEXT: 1 3 0.50 not z31.b, p7/m, z31.b
# CHECK-NEXT: 1 3 0.50 not z31.d, p7/m, z31.d
# CHECK-NEXT: 1 3 0.50 not z31.h, p7/m, z31.h
# CHECK-NEXT: 1 3 0.50 not z31.s, p7/m, z31.s
-# CHECK-NEXT: 1 6 1.00 nots p0.b, p0/z, p0.b
-# CHECK-NEXT: 1 6 1.00 nots p15.b, p15/z, p15.b
-# CHECK-NEXT: 1 6 1.00 orn p0.b, p0/z, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 orn p15.b, p15/z, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 orns p0.b, p0/z, p0.b, p0.b
-# CHECK-NEXT: 1 6 1.00 orns p15.b, p15/z, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 orr p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT: 1 2 1.00 nots p0.b, p0/z, p0.b
+# CHECK-NEXT: 1 2 1.00 nots p15.b, p15/z, p15.b
+# CHECK-NEXT: 1 2 1.00 orn p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 orn p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 orns p0.b, p0/z, p0.b, p0.b
+# CHECK-NEXT: 1 2 1.00 orns p15.b, p15/z, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 orr p0.b, p0/z, p0.b, p1.b
# CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0x6
# CHECK-NEXT: 1 3 0.50 orr z0.d, z0.d, #0xfffffffffffffff9
# CHECK-NEXT: 1 3 0.50 orr z0.s, z0.s, #0x6
@@ -5145,27 +5145,27 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 orr z31.s, p7/m, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0x6
# CHECK-NEXT: 1 3 0.50 orr z5.b, z5.b, #0xf9
-# CHECK-NEXT: 1 6 1.00 orrs p0.b, p0/z, p0.b, p1.b
+# CHECK-NEXT: 1 2 1.00 orrs p0.b, p0/z, p0.b, p1.b
# CHECK-NEXT: 1 4 1.00 orv b0, p7, z31.b
# CHECK-NEXT: 1 4 1.00 orv d0, p7, z31.d
# CHECK-NEXT: 1 4 1.00 orv h0, p7, z31.h
# CHECK-NEXT: 1 4 1.00 orv s0, p7, z31.s
-# CHECK-NEXT: 1 6 1.00 pfalse p15.b
-# CHECK-NEXT: 1 6 1.00 pfirst p0.b, p15, p0.b
-# CHECK-NEXT: 1 6 1.00 pfirst p15.b, p15, p15.b
+# CHECK-NEXT: 1 2 1.00 pfalse p15.b
+# CHECK-NEXT: 1 2 1.00 pfirst p0.b, p15, p0.b
+# CHECK-NEXT: 1 2 1.00 pfirst p15.b, p15, p15.b
# CHECK-NEXT: 1 4 0.50 pmul z0.b, z1.b, z2.b
# CHECK-NEXT: 1 4 0.50 pmul z29.b, z30.b, z31.b
-# CHECK-NEXT: 1 6 1.00 pmullb z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 6 1.00 pmullb z29.q, z30.d, z31.d
-# CHECK-NEXT: 1 6 1.00 pmullb z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 pmullt z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 6 1.00 pmullt z29.q, z30.d, z31.d
-# CHECK-NEXT: 1 6 1.00 pmullt z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 pnext p0.b, p15, p0.b
-# CHECK-NEXT: 1 6 1.00 pnext p0.d, p15, p0.d
-# CHECK-NEXT: 1 6 1.00 pnext p0.h, p15, p0.h
-# CHECK-NEXT: 1 6 1.00 pnext p0.s, p15, p0.s
-# CHECK-NEXT: 1 6 1.00 pnext p15.b, p15, p15.b
+# CHECK-NEXT: 1 9 1.00 pmullb z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 9 1.00 pmullb z29.q, z30.d, z31.d
+# CHECK-NEXT: 1 9 1.00 pmullb z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 9 1.00 pmullt z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 9 1.00 pmullt z29.q, z30.d, z31.d
+# CHECK-NEXT: 1 9 1.00 pmullt z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 2 1.00 pnext p0.b, p15, p0.b
+# CHECK-NEXT: 1 2 1.00 pnext p0.d, p15, p0.d
+# CHECK-NEXT: 1 2 1.00 pnext p0.h, p15, p0.h
+# CHECK-NEXT: 1 2 1.00 pnext p0.s, p15, p0.s
+# CHECK-NEXT: 1 2 1.00 pnext p15.b, p15, p15.b
# CHECK-NEXT: 1 0 0.50 * * U prfb #14, p0, [x0]
# CHECK-NEXT: 1 0 0.50 * * U prfb #15, p0, [x0]
# CHECK-NEXT: 1 0 0.50 * * U prfb #6, p0, [x0]
@@ -5274,97 +5274,97 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 0 0.50 * * U prfw pstl2strm, p0, [x0]
# CHECK-NEXT: 1 0 0.50 * * U prfw pstl3keep, p0, [x0]
# CHECK-NEXT: 1 0 0.50 * * U prfw pstl3strm, p0, [x0]
-# CHECK-NEXT: 1 6 1.00 ptest p15, p0.b
-# CHECK-NEXT: 1 6 1.00 ptest p15, p15.b
-# CHECK-NEXT: 1 6 1.00 ptrue p0.b, pow2
-# CHECK-NEXT: 1 6 1.00 ptrue p0.d, pow2
-# CHECK-NEXT: 1 6 1.00 ptrue p0.h, pow2
-# CHECK-NEXT: 1 6 1.00 ptrue p0.s, pow2
-# CHECK-NEXT: 1 6 1.00 ptrue p15.b
-# CHECK-NEXT: 1 6 1.00 ptrue p15.d
-# CHECK-NEXT: 1 6 1.00 ptrue p15.h
-# CHECK-NEXT: 1 6 1.00 ptrue p15.s
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #14
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #15
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #16
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #17
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #18
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #19
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #20
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #21
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #22
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #23
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #24
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #25
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #26
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #27
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, #28
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, mul3
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, mul4
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl1
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl128
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl16
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl2
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl256
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl3
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl32
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl4
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl5
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl6
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl64
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl7
-# CHECK-NEXT: 1 6 1.00 ptrue p7.s, vl8
-# CHECK-NEXT: 1 6 1.00 ptrues p0.b, pow2
-# CHECK-NEXT: 1 6 1.00 ptrues p0.d, pow2
-# CHECK-NEXT: 1 6 1.00 ptrues p0.h, pow2
-# CHECK-NEXT: 1 6 1.00 ptrues p0.s, pow2
-# CHECK-NEXT: 1 6 1.00 ptrues p15.b
-# CHECK-NEXT: 1 6 1.00 ptrues p15.d
-# CHECK-NEXT: 1 6 1.00 ptrues p15.h
-# CHECK-NEXT: 1 6 1.00 ptrues p15.s
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #14
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #15
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #16
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #17
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #18
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #19
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #20
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #21
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #22
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #23
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #24
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #25
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #26
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #27
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, #28
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, mul3
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, mul4
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl1
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl128
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl16
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl2
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl256
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl3
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl32
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl4
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl5
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl6
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl64
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl7
-# CHECK-NEXT: 1 6 1.00 ptrues p7.s, vl8
-# CHECK-NEXT: 1 6 1.00 punpkhi p0.h, p0.b
-# CHECK-NEXT: 1 6 1.00 punpkhi p15.h, p15.b
-# CHECK-NEXT: 1 6 1.00 punpklo p0.h, p0.b
-# CHECK-NEXT: 1 6 1.00 punpklo p15.h, p15.b
-# CHECK-NEXT: 1 4 0.50 raddhnb z0.b, z1.h, z31.h
-# CHECK-NEXT: 1 4 0.50 raddhnb z0.h, z1.s, z31.s
-# CHECK-NEXT: 1 4 0.50 raddhnb z0.s, z1.d, z31.d
-# CHECK-NEXT: 1 4 0.50 raddhnt z0.b, z1.h, z31.h
-# CHECK-NEXT: 1 4 0.50 raddhnt z0.h, z1.s, z31.s
-# CHECK-NEXT: 1 4 0.50 raddhnt z0.s, z1.d, z31.d
-# CHECK-NEXT: 1 8 1.00 rax1 z0.d, z1.d, z31.d
+# CHECK-NEXT: 1 2 1.00 ptest p15, p0.b
+# CHECK-NEXT: 1 2 1.00 ptest p15, p15.b
+# CHECK-NEXT: 1 2 1.00 ptrue p0.b, pow2
+# CHECK-NEXT: 1 2 1.00 ptrue p0.d, pow2
+# CHECK-NEXT: 1 2 1.00 ptrue p0.h, pow2
+# CHECK-NEXT: 1 2 1.00 ptrue p0.s, pow2
+# CHECK-NEXT: 1 2 1.00 ptrue p15.b
+# CHECK-NEXT: 1 2 1.00 ptrue p15.d
+# CHECK-NEXT: 1 2 1.00 ptrue p15.h
+# CHECK-NEXT: 1 2 1.00 ptrue p15.s
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #14
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #15
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #16
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #17
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #18
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #19
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #20
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #21
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #22
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #23
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #24
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #25
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #26
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #27
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, #28
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, mul3
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, mul4
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl1
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl128
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl16
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl2
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl256
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl3
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl32
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl4
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl5
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl6
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl64
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl7
+# CHECK-NEXT: 1 2 1.00 ptrue p7.s, vl8
+# CHECK-NEXT: 1 2 1.00 ptrues p0.b, pow2
+# CHECK-NEXT: 1 2 1.00 ptrues p0.d, pow2
+# CHECK-NEXT: 1 2 1.00 ptrues p0.h, pow2
+# CHECK-NEXT: 1 2 1.00 ptrues p0.s, pow2
+# CHECK-NEXT: 1 2 1.00 ptrues p15.b
+# CHECK-NEXT: 1 2 1.00 ptrues p15.d
+# CHECK-NEXT: 1 2 1.00 ptrues p15.h
+# CHECK-NEXT: 1 2 1.00 ptrues p15.s
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #14
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #15
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #16
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #17
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #18
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #19
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #20
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #21
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #22
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #23
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #24
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #25
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #26
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #27
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, #28
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul3
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, mul4
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl1
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl128
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl16
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl2
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl256
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl3
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl32
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl4
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl5
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl6
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl64
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl7
+# CHECK-NEXT: 1 2 1.00 ptrues p7.s, vl8
+# CHECK-NEXT: 1 2 1.00 punpkhi p0.h, p0.b
+# CHECK-NEXT: 1 2 1.00 punpkhi p15.h, p15.b
+# CHECK-NEXT: 1 2 1.00 punpklo p0.h, p0.b
+# CHECK-NEXT: 1 2 1.00 punpklo p15.h, p15.b
+# CHECK-NEXT: 1 8 0.50 raddhnb z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 raddhnb z0.h, z1.s, z31.s
+# CHECK-NEXT: 1 8 0.50 raddhnb z0.s, z1.d, z31.d
+# CHECK-NEXT: 1 8 0.50 raddhnt z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 raddhnt z0.h, z1.s, z31.s
+# CHECK-NEXT: 1 8 0.50 raddhnt z0.s, z1.d, z31.d
+# CHECK-NEXT: 1 9 1.00 rax1 z0.d, z1.d, z31.d
# CHECK-NEXT: 1 3 0.50 rbit z0.b, p7/m, z31.b
# CHECK-NEXT: 1 3 0.50 rbit z0.d, p7/m, z31.d
# CHECK-NEXT: 1 3 0.50 rbit z0.h, p7/m, z31.h
@@ -5379,16 +5379,16 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 1 0.33 rdvl x21, #-32
# CHECK-NEXT: 1 1 0.33 rdvl x23, #31
# CHECK-NEXT: 1 1 0.33 rdvl xzr, #-1
-# CHECK-NEXT: 1 4 0.50 rev z0.b, z31.b
-# CHECK-NEXT: 1 4 0.50 rev z0.d, z31.d
-# CHECK-NEXT: 1 4 0.50 rev z0.h, z31.h
-# CHECK-NEXT: 1 4 0.50 rev z0.s, z31.s
-# CHECK-NEXT: 1 4 0.50 revb z0.d, p7/m, z31.d
-# CHECK-NEXT: 1 4 0.50 revb z0.h, p7/m, z31.h
-# CHECK-NEXT: 1 4 0.50 revb z0.s, p7/m, z31.s
-# CHECK-NEXT: 1 4 0.50 revh z0.d, p7/m, z31.d
-# CHECK-NEXT: 1 4 0.50 revh z0.s, p7/m, z31.s
-# CHECK-NEXT: 1 4 0.50 revw z0.d, p7/m, z31.d
+# CHECK-NEXT: 1 3 0.50 rev z0.b, z31.b
+# CHECK-NEXT: 1 3 0.50 rev z0.d, z31.d
+# CHECK-NEXT: 1 3 0.50 rev z0.h, z31.h
+# CHECK-NEXT: 1 3 0.50 rev z0.s, z31.s
+# CHECK-NEXT: 1 3 0.50 revb z0.d, p7/m, z31.d
+# CHECK-NEXT: 1 3 0.50 revb z0.h, p7/m, z31.h
+# CHECK-NEXT: 1 3 0.50 revb z0.s, p7/m, z31.s
+# CHECK-NEXT: 1 3 0.50 revh z0.d, p7/m, z31.d
+# CHECK-NEXT: 1 3 0.50 revh z0.s, p7/m, z31.s
+# CHECK-NEXT: 1 3 0.50 revw z0.d, p7/m, z31.d
# CHECK-NEXT: 1 4 0.50 rshrnb z0.b, z0.h, #1
# CHECK-NEXT: 1 4 0.50 rshrnb z0.h, z0.s, #1
# CHECK-NEXT: 1 4 0.50 rshrnb z0.s, z0.d, #1
@@ -5401,22 +5401,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 rshrnt z31.b, z31.h, #8
# CHECK-NEXT: 1 4 0.50 rshrnt z31.h, z31.s, #16
# CHECK-NEXT: 1 4 0.50 rshrnt z31.s, z31.d, #32
-# CHECK-NEXT: 1 4 0.50 rsubhnb z0.b, z1.h, z31.h
-# CHECK-NEXT: 1 4 0.50 rsubhnb z0.h, z1.s, z31.s
-# CHECK-NEXT: 1 4 0.50 rsubhnb z0.s, z1.d, z31.d
-# CHECK-NEXT: 1 4 0.50 rsubhnt z0.b, z1.h, z31.h
-# CHECK-NEXT: 1 4 0.50 rsubhnt z0.h, z1.s, z31.s
-# CHECK-NEXT: 1 4 0.50 rsubhnt z0.s, z1.d, z31.d
-# CHECK-NEXT: 1 8 1.00 saba z0.b, z1.b, z31.b
-# CHECK-NEXT: 1 8 1.00 saba z0.d, z1.d, z31.d
-# CHECK-NEXT: 1 8 1.00 saba z0.h, z1.h, z31.h
-# CHECK-NEXT: 1 8 1.00 saba z0.s, z1.s, z31.s
-# CHECK-NEXT: 1 8 1.00 sabalb z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 8 1.00 sabalb z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 8 1.00 sabalb z0.s, z1.h, z31.h
-# CHECK-NEXT: 1 8 1.00 sabalt z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 8 1.00 sabalt z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 8 1.00 sabalt z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 rsubhnb z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 rsubhnb z0.h, z1.s, z31.s
+# CHECK-NEXT: 1 8 0.50 rsubhnb z0.s, z1.d, z31.d
+# CHECK-NEXT: 1 8 0.50 rsubhnt z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 rsubhnt z0.h, z1.s, z31.s
+# CHECK-NEXT: 1 8 0.50 rsubhnt z0.s, z1.d, z31.d
+# CHECK-NEXT: 1 6 1.00 saba z0.b, z1.b, z31.b
+# CHECK-NEXT: 1 6 1.00 saba z0.d, z1.d, z31.d
+# CHECK-NEXT: 1 6 1.00 saba z0.h, z1.h, z31.h
+# CHECK-NEXT: 1 6 1.00 saba z0.s, z1.s, z31.s
+# CHECK-NEXT: 1 6 1.00 sabalb z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 6 1.00 sabalb z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 6 1.00 sabalb z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 6 1.00 sabalt z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 6 1.00 sabalt z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 6 1.00 sabalt z0.s, z1.h, z31.h
# CHECK-NEXT: 1 3 0.50 sabd z31.b, p7/m, z31.b, z31.b
# CHECK-NEXT: 1 3 0.50 sabd z31.d, p7/m, z31.d, z31.d
# CHECK-NEXT: 1 3 0.50 sabd z31.h, p7/m, z31.h, z31.h
@@ -5430,24 +5430,24 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 1.00 sadalp z0.h, p0/m, z1.b
# CHECK-NEXT: 1 7 1.00 sadalp z29.s, p0/m, z30.h
# CHECK-NEXT: 1 7 1.00 sadalp z30.d, p7/m, z31.s
-# CHECK-NEXT: 1 3 0.50 saddlb z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 saddlb z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 saddlb z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 saddlbt z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 3 0.50 saddlbt z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 3 0.50 saddlbt z0.s, z1.h, z31.h
-# CHECK-NEXT: 1 3 0.50 saddlt z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 saddlt z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 saddlt z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 saddlb z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 saddlb z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 saddlb z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 saddlbt z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 4 0.50 saddlbt z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 4 0.50 saddlbt z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 4 0.50 saddlt z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 saddlt z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 saddlt z31.d, z31.s, z31.s
# CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.b
# CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.h
# CHECK-NEXT: 1 4 1.00 saddv d0, p7, z31.s
-# CHECK-NEXT: 1 3 0.50 saddwb z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 saddwb z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 saddwb z31.d, z31.d, z31.s
-# CHECK-NEXT: 1 3 0.50 saddwt z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 saddwt z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 saddwt z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 saddwb z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 saddwb z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 saddwb z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 saddwt z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 saddwt z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 saddwt z31.d, z31.d, z31.s
# CHECK-NEXT: 1 4 0.50 sbclb z0.d, z1.d, z31.d
# CHECK-NEXT: 1 4 0.50 sbclb z0.s, z1.s, z31.s
# CHECK-NEXT: 1 4 0.50 sbclt z0.d, z1.d, z31.d
@@ -5504,8 +5504,8 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 sli z31.d, z31.d, #63
# CHECK-NEXT: 1 3 0.50 sli z31.h, z31.h, #15
# CHECK-NEXT: 1 3 0.50 sli z31.s, z31.s, #31
-# CHECK-NEXT: 1 8 1.00 sm4e z0.s, z0.s, z31.s
-# CHECK-NEXT: 1 8 1.00 sm4ekey z0.s, z1.s, z31.s
+# CHECK-NEXT: 1 9 1.00 sm4e z0.s, z0.s, z31.s
+# CHECK-NEXT: 1 9 1.00 sm4ekey z0.s, z1.s, z31.s
# CHECK-NEXT: 1 3 0.50 smax z0.b, z0.b, #-128
# CHECK-NEXT: 1 3 0.50 smax z0.d, z0.d, #-128
# CHECK-NEXT: 1 3 0.50 smax z0.h, z0.h, #-128
@@ -5624,61 +5624,61 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 sqcadd z31.d, z31.d, z31.d, #270
# CHECK-NEXT: 1 4 0.50 sqcadd z31.h, z31.h, z31.h, #270
# CHECK-NEXT: 1 4 0.50 sqcadd z31.s, z31.s, z31.s, #270
-# CHECK-NEXT: 1 1 0.33 sqdecb x0
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, #14
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdecb x0, w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecd x0
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, #14
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdecd x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecb x0
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, #14
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdecb x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecd x0
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, #14
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdecd x0, w0, pow2, mul #16
# CHECK-NEXT: 1 4 0.50 sqdecd z0.d
# CHECK-NEXT: 1 4 0.50 sqdecd z0.d, all, mul #16
# CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2
# CHECK-NEXT: 1 4 0.50 sqdecd z0.d, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdech x0
-# CHECK-NEXT: 1 1 0.33 sqdech x0, #14
-# CHECK-NEXT: 1 1 0.33 sqdech x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdech x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdech x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqdech x0, w0
-# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdech x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdech x0
+# CHECK-NEXT: 1 4 0.33 sqdech x0, #14
+# CHECK-NEXT: 1 4 0.33 sqdech x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdech x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdech x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqdech x0, w0
+# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdech x0, w0, pow2, mul #16
# CHECK-NEXT: 1 4 0.50 sqdech z0.h
# CHECK-NEXT: 1 4 0.50 sqdech z0.h, all, mul #16
# CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2
# CHECK-NEXT: 1 4 0.50 sqdech z0.h, pow2, mul #16
-# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.b
-# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.d
-# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.h
-# CHECK-NEXT: 1 8 1.00 sqdecp x0, p0.s
-# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.b, wzr
-# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.d, wzr
-# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.h, wzr
-# CHECK-NEXT: 1 8 1.00 sqdecp xzr, p15.s, wzr
+# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.b
+# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.d
+# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.h
+# CHECK-NEXT: 1 9 1.00 sqdecp x0, p0.s
+# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.b, wzr
+# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.d, wzr
+# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.h, wzr
+# CHECK-NEXT: 1 9 1.00 sqdecp xzr, p15.s, wzr
# CHECK-NEXT: 1 4 0.50 sqdecp z0.d, p0.d
# CHECK-NEXT: 1 4 0.50 sqdecp z0.h, p0.h
# CHECK-NEXT: 1 4 0.50 sqdecp z0.s, p0.s
-# CHECK-NEXT: 1 1 0.33 sqdecw x0
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, #14
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqdecw x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecw x0
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, #14
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqdecw x0, w0, pow2, mul #16
# CHECK-NEXT: 1 4 0.50 sqdecw z0.s
# CHECK-NEXT: 1 4 0.50 sqdecw z0.s, all, mul #16
# CHECK-NEXT: 1 4 0.50 sqdecw z0.s, pow2
@@ -5726,61 +5726,61 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 sqdmullt z0.s, z1.h, z7.h[7]
# CHECK-NEXT: 1 4 0.50 sqdmullt z29.s, z30.h, z31.h
# CHECK-NEXT: 1 4 0.50 sqdmullt z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 1 0.33 sqincb x0
-# CHECK-NEXT: 1 1 0.33 sqincb x0, #14
-# CHECK-NEXT: 1 1 0.33 sqincb x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincb x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqincb x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqincb x0, w0
-# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqincb x0, w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincd x0
-# CHECK-NEXT: 1 1 0.33 sqincd x0, #14
-# CHECK-NEXT: 1 1 0.33 sqincd x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincd x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqincd x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqincd x0, w0
-# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqincd x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincb x0
+# CHECK-NEXT: 1 4 0.33 sqincb x0, #14
+# CHECK-NEXT: 1 4 0.33 sqincb x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincb x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqincb x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqincb x0, w0
+# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqincb x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincd x0
+# CHECK-NEXT: 1 4 0.33 sqincd x0, #14
+# CHECK-NEXT: 1 4 0.33 sqincd x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincd x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqincd x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqincd x0, w0
+# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqincd x0, w0, pow2, mul #16
# CHECK-NEXT: 1 4 0.50 sqincd z0.d
# CHECK-NEXT: 1 4 0.50 sqincd z0.d, all, mul #16
# CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2
# CHECK-NEXT: 1 4 0.50 sqincd z0.d, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 sqinch x0
-# CHECK-NEXT: 1 1 0.33 sqinch x0, #14
-# CHECK-NEXT: 1 1 0.33 sqinch x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqinch x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqinch x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqinch x0, w0
-# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqinch x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqinch x0
+# CHECK-NEXT: 1 4 0.33 sqinch x0, #14
+# CHECK-NEXT: 1 4 0.33 sqinch x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqinch x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqinch x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqinch x0, w0
+# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqinch x0, w0, pow2, mul #16
# CHECK-NEXT: 1 4 0.50 sqinch z0.h
# CHECK-NEXT: 1 4 0.50 sqinch z0.h, all, mul #16
# CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2
# CHECK-NEXT: 1 4 0.50 sqinch z0.h, pow2, mul #16
-# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.b
-# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.d
-# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.h
-# CHECK-NEXT: 1 8 1.00 sqincp x0, p0.s
-# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.b, wzr
-# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.d, wzr
-# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.h, wzr
-# CHECK-NEXT: 1 8 1.00 sqincp xzr, p15.s, wzr
+# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.b
+# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.d
+# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.h
+# CHECK-NEXT: 1 9 1.00 sqincp x0, p0.s
+# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.b, wzr
+# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.d, wzr
+# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.h, wzr
+# CHECK-NEXT: 1 9 1.00 sqincp xzr, p15.s, wzr
# CHECK-NEXT: 1 4 0.50 sqincp z0.d, p0.d
# CHECK-NEXT: 1 4 0.50 sqincp z0.h, p0.h
# CHECK-NEXT: 1 4 0.50 sqincp z0.s, p0.s
-# CHECK-NEXT: 1 1 0.33 sqincw x0
-# CHECK-NEXT: 1 1 0.33 sqincw x0, #14
-# CHECK-NEXT: 1 1 0.33 sqincw x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincw x0, pow2
-# CHECK-NEXT: 1 1 0.33 sqincw x0, vl1
-# CHECK-NEXT: 1 1 0.33 sqincw x0, w0
-# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, pow2
-# CHECK-NEXT: 1 1 0.33 sqincw x0, w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincw x0
+# CHECK-NEXT: 1 4 0.33 sqincw x0, #14
+# CHECK-NEXT: 1 4 0.33 sqincw x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincw x0, pow2
+# CHECK-NEXT: 1 4 0.33 sqincw x0, vl1
+# CHECK-NEXT: 1 4 0.33 sqincw x0, w0
+# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, pow2
+# CHECK-NEXT: 1 4 0.33 sqincw x0, w0, pow2, mul #16
# CHECK-NEXT: 1 4 0.50 sqincw z0.s
# CHECK-NEXT: 1 4 0.50 sqincw z0.s, all, mul #16
# CHECK-NEXT: 1 4 0.50 sqincw z0.s, pow2
@@ -6001,24 +6001,24 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 ssra z31.d, z31.d, #64
# CHECK-NEXT: 1 4 0.50 ssra z31.h, z31.h, #16
# CHECK-NEXT: 1 4 0.50 ssra z31.s, z31.s, #32
-# CHECK-NEXT: 1 3 0.50 ssublb z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 ssublb z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 ssublb z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 ssublbt z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 3 0.50 ssublbt z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 3 0.50 ssublbt z0.s, z1.h, z31.h
-# CHECK-NEXT: 1 3 0.50 ssublt z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 ssublt z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 ssublt z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 ssubltb z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 3 0.50 ssubltb z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 3 0.50 ssubltb z0.s, z1.h, z31.h
-# CHECK-NEXT: 1 3 0.50 ssubwb z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 ssubwb z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 ssubwb z31.d, z31.d, z31.s
-# CHECK-NEXT: 1 3 0.50 ssubwt z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 ssubwt z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 ssubwt z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 ssublb z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 ssublb z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 ssublb z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 ssublbt z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 4 0.50 ssublbt z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 4 0.50 ssublbt z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 4 0.50 ssublt z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 ssublt z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 ssublt z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 ssubltb z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 4 0.50 ssubltb z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 4 0.50 ssubltb z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 4 0.50 ssubwb z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 ssubwb z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 ssubwb z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 ssubwt z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 ssubwt z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 ssubwt z31.d, z31.d, z31.s
# CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0, x0]
# CHECK-NEXT: 1 1 1.00 * st1b { z0.b }, p0, [x0]
# CHECK-NEXT: 1 1 1.00 * st1b { z0.d }, p0, [x0, x0]
@@ -6250,12 +6250,12 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 sub z31.s, p7/m, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, #65280
# CHECK-NEXT: 1 3 0.50 sub z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 4 0.50 subhnb z0.b, z1.h, z31.h
-# CHECK-NEXT: 1 4 0.50 subhnb z0.h, z1.s, z31.s
-# CHECK-NEXT: 1 4 0.50 subhnb z0.s, z1.d, z31.d
-# CHECK-NEXT: 1 4 0.50 subhnt z0.b, z1.h, z31.h
-# CHECK-NEXT: 1 4 0.50 subhnt z0.h, z1.s, z31.s
-# CHECK-NEXT: 1 4 0.50 subhnt z0.s, z1.d, z31.d
+# CHECK-NEXT: 1 8 0.50 subhnb z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 subhnb z0.h, z1.s, z31.s
+# CHECK-NEXT: 1 8 0.50 subhnb z0.s, z1.d, z31.d
+# CHECK-NEXT: 1 8 0.50 subhnt z0.b, z1.h, z31.h
+# CHECK-NEXT: 1 8 0.50 subhnt z0.h, z1.s, z31.s
+# CHECK-NEXT: 1 8 0.50 subhnt z0.s, z1.d, z31.d
# CHECK-NEXT: 1 3 0.50 subr z0.b, p0/m, z0.b, z0.b
# CHECK-NEXT: 1 3 0.50 subr z0.b, z0.b, #0
# CHECK-NEXT: 1 3 0.50 subr z0.d, p0/m, z0.d, z0.d
@@ -6305,32 +6305,32 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 tbx z31.d, z31.d, z31.d
# CHECK-NEXT: 1 4 0.50 tbx z31.h, z31.h, z31.h
# CHECK-NEXT: 1 4 0.50 tbx z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 trn1 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 trn1 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 trn1 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 trn1 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 trn1 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 trn1 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 trn1 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 trn1 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 trn2 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 trn2 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 trn2 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 trn2 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 trn2 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 trn2 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 trn2 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 trn2 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 8 1.00 uaba z0.b, z1.b, z31.b
-# CHECK-NEXT: 1 8 1.00 uaba z0.d, z1.d, z31.d
-# CHECK-NEXT: 1 8 1.00 uaba z0.h, z1.h, z31.h
-# CHECK-NEXT: 1 8 1.00 uaba z0.s, z1.s, z31.s
-# CHECK-NEXT: 1 8 1.00 uabalb z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 8 1.00 uabalb z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 8 1.00 uabalb z0.s, z1.h, z31.h
-# CHECK-NEXT: 1 8 1.00 uabalt z0.d, z1.s, z31.s
-# CHECK-NEXT: 1 8 1.00 uabalt z0.h, z1.b, z31.b
-# CHECK-NEXT: 1 8 1.00 uabalt z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 2 1.00 trn1 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 trn1 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 trn1 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 trn1 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 trn1 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 trn1 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 trn1 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 trn1 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 2 1.00 trn2 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 trn2 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 trn2 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 trn2 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 trn2 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 trn2 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 trn2 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 trn2 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 6 1.00 uaba z0.b, z1.b, z31.b
+# CHECK-NEXT: 1 6 1.00 uaba z0.d, z1.d, z31.d
+# CHECK-NEXT: 1 6 1.00 uaba z0.h, z1.h, z31.h
+# CHECK-NEXT: 1 6 1.00 uaba z0.s, z1.s, z31.s
+# CHECK-NEXT: 1 6 1.00 uabalb z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 6 1.00 uabalb z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 6 1.00 uabalb z0.s, z1.h, z31.h
+# CHECK-NEXT: 1 6 1.00 uabalt z0.d, z1.s, z31.s
+# CHECK-NEXT: 1 6 1.00 uabalt z0.h, z1.b, z31.b
+# CHECK-NEXT: 1 6 1.00 uabalt z0.s, z1.h, z31.h
# CHECK-NEXT: 1 3 0.50 uabd z31.b, p7/m, z31.b, z31.b
# CHECK-NEXT: 1 3 0.50 uabd z31.d, p7/m, z31.d, z31.d
# CHECK-NEXT: 1 3 0.50 uabd z31.h, p7/m, z31.h, z31.h
@@ -6344,22 +6344,22 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 7 1.00 uadalp z0.h, p0/m, z1.b
# CHECK-NEXT: 1 7 1.00 uadalp z29.s, p0/m, z30.h
# CHECK-NEXT: 1 7 1.00 uadalp z30.d, p7/m, z31.s
-# CHECK-NEXT: 1 3 0.50 uaddlb z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 uaddlb z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 uaddlb z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 uaddlt z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 uaddlt z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 uaddlt z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 uaddlb z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 uaddlb z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 uaddlb z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 uaddlt z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 uaddlt z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 uaddlt z31.d, z31.s, z31.s
# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.b
# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.d
# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.h
# CHECK-NEXT: 1 4 1.00 uaddv d0, p7, z31.s
-# CHECK-NEXT: 1 3 0.50 uaddwb z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 uaddwb z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 uaddwb z31.d, z31.d, z31.s
-# CHECK-NEXT: 1 3 0.50 uaddwt z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 uaddwt z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 uaddwt z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 uaddwb z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 uaddwb z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 uaddwb z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 uaddwt z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 uaddwt z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 uaddwt z31.d, z31.d, z31.s
# CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.d
# CHECK-NEXT: 1 4 0.50 ucvtf z0.d, p0/m, z0.s
# CHECK-NEXT: 1 4 0.50 ucvtf z0.h, p0/m, z0.d
@@ -6473,120 +6473,120 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 uqadd z31.d, z31.d, #65280
# CHECK-NEXT: 1 4 0.50 uqadd z31.h, z31.h, #65280
# CHECK-NEXT: 1 4 0.50 uqadd z31.s, z31.s, #65280
-# CHECK-NEXT: 1 1 0.33 uqdecb w0
-# CHECK-NEXT: 1 1 0.33 uqdecb w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecb w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdecb w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecb x0
-# CHECK-NEXT: 1 1 0.33 uqdecb x0, #14
-# CHECK-NEXT: 1 1 0.33 uqdecb x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecb x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdecb x0, vl1
-# CHECK-NEXT: 1 1 0.33 uqdecd w0
-# CHECK-NEXT: 1 1 0.33 uqdecd w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecd w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdecd w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecd x0
-# CHECK-NEXT: 1 1 0.33 uqdecd x0, #14
-# CHECK-NEXT: 1 1 0.33 uqdecd x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecd x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdecd x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqdecb w0
+# CHECK-NEXT: 1 4 0.33 uqdecb w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecb w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdecb w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecb x0
+# CHECK-NEXT: 1 4 0.33 uqdecb x0, #14
+# CHECK-NEXT: 1 4 0.33 uqdecb x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecb x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdecb x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqdecd w0
+# CHECK-NEXT: 1 4 0.33 uqdecd w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecd w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdecd w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecd x0
+# CHECK-NEXT: 1 4 0.33 uqdecd x0, #14
+# CHECK-NEXT: 1 4 0.33 uqdecd x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecd x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdecd x0, vl1
# CHECK-NEXT: 1 4 0.50 uqdecd z0.d
# CHECK-NEXT: 1 4 0.50 uqdecd z0.d, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2
# CHECK-NEXT: 1 4 0.50 uqdecd z0.d, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdech w0
-# CHECK-NEXT: 1 1 0.33 uqdech w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdech w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdech w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdech x0
-# CHECK-NEXT: 1 1 0.33 uqdech x0, #14
-# CHECK-NEXT: 1 1 0.33 uqdech x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdech x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdech x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqdech w0
+# CHECK-NEXT: 1 4 0.33 uqdech w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdech w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdech w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdech x0
+# CHECK-NEXT: 1 4 0.33 uqdech x0, #14
+# CHECK-NEXT: 1 4 0.33 uqdech x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdech x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdech x0, vl1
# CHECK-NEXT: 1 4 0.50 uqdech z0.h
# CHECK-NEXT: 1 4 0.50 uqdech z0.h, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2
# CHECK-NEXT: 1 4 0.50 uqdech z0.h, pow2, mul #16
-# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.b
-# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.d
-# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.h
-# CHECK-NEXT: 1 8 1.00 uqdecp wzr, p15.s
-# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.b
-# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.d
-# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.h
-# CHECK-NEXT: 1 8 1.00 uqdecp x0, p0.s
+# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.b
+# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.d
+# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.h
+# CHECK-NEXT: 1 9 1.00 uqdecp wzr, p15.s
+# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.b
+# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.d
+# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.h
+# CHECK-NEXT: 1 9 1.00 uqdecp x0, p0.s
# CHECK-NEXT: 1 4 0.50 uqdecp z0.d, p0.d
# CHECK-NEXT: 1 4 0.50 uqdecp z0.h, p0.h
# CHECK-NEXT: 1 4 0.50 uqdecp z0.s, p0.s
-# CHECK-NEXT: 1 1 0.33 uqdecw w0
-# CHECK-NEXT: 1 1 0.33 uqdecw w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecw w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdecw w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecw x0
-# CHECK-NEXT: 1 1 0.33 uqdecw x0, #14
-# CHECK-NEXT: 1 1 0.33 uqdecw x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqdecw x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqdecw x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqdecw w0
+# CHECK-NEXT: 1 4 0.33 uqdecw w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecw w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdecw w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecw x0
+# CHECK-NEXT: 1 4 0.33 uqdecw x0, #14
+# CHECK-NEXT: 1 4 0.33 uqdecw x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqdecw x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqdecw x0, vl1
# CHECK-NEXT: 1 4 0.50 uqdecw z0.s
# CHECK-NEXT: 1 4 0.50 uqdecw z0.s, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2
# CHECK-NEXT: 1 4 0.50 uqdecw z0.s, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincb w0
-# CHECK-NEXT: 1 1 0.33 uqincb w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincb w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincb w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincb x0
-# CHECK-NEXT: 1 1 0.33 uqincb x0, #14
-# CHECK-NEXT: 1 1 0.33 uqincb x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincb x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincb x0, vl1
-# CHECK-NEXT: 1 1 0.33 uqincd w0
-# CHECK-NEXT: 1 1 0.33 uqincd w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincd w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincd w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincd x0
-# CHECK-NEXT: 1 1 0.33 uqincd x0, #14
-# CHECK-NEXT: 1 1 0.33 uqincd x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincd x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincd x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqincb w0
+# CHECK-NEXT: 1 4 0.33 uqincb w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincb w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincb w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincb x0
+# CHECK-NEXT: 1 4 0.33 uqincb x0, #14
+# CHECK-NEXT: 1 4 0.33 uqincb x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincb x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincb x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqincd w0
+# CHECK-NEXT: 1 4 0.33 uqincd w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincd w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincd w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincd x0
+# CHECK-NEXT: 1 4 0.33 uqincd x0, #14
+# CHECK-NEXT: 1 4 0.33 uqincd x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincd x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincd x0, vl1
# CHECK-NEXT: 1 4 0.50 uqincd z0.d
# CHECK-NEXT: 1 4 0.50 uqincd z0.d, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2
# CHECK-NEXT: 1 4 0.50 uqincd z0.d, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqinch w0
-# CHECK-NEXT: 1 1 0.33 uqinch w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqinch w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqinch w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqinch x0
-# CHECK-NEXT: 1 1 0.33 uqinch x0, #14
-# CHECK-NEXT: 1 1 0.33 uqinch x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqinch x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqinch x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqinch w0
+# CHECK-NEXT: 1 4 0.33 uqinch w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqinch w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqinch w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqinch x0
+# CHECK-NEXT: 1 4 0.33 uqinch x0, #14
+# CHECK-NEXT: 1 4 0.33 uqinch x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqinch x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqinch x0, vl1
# CHECK-NEXT: 1 4 0.50 uqinch z0.h
# CHECK-NEXT: 1 4 0.50 uqinch z0.h, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2
# CHECK-NEXT: 1 4 0.50 uqinch z0.h, pow2, mul #16
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.b
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.d
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.h
-# CHECK-NEXT: 1 8 1.00 uqincp wzr, p15.s
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.b
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.d
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.h
-# CHECK-NEXT: 1 8 1.00 uqincp x0, p0.s
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.b
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.d
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.h
+# CHECK-NEXT: 1 9 1.00 uqincp wzr, p15.s
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.b
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.d
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.h
+# CHECK-NEXT: 1 9 1.00 uqincp x0, p0.s
# CHECK-NEXT: 1 4 0.50 uqincp z0.d, p0.d
# CHECK-NEXT: 1 4 0.50 uqincp z0.h, p0.h
# CHECK-NEXT: 1 4 0.50 uqincp z0.s, p0.s
-# CHECK-NEXT: 1 1 0.33 uqincw w0
-# CHECK-NEXT: 1 1 0.33 uqincw w0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincw w0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincw w0, pow2, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincw x0
-# CHECK-NEXT: 1 1 0.33 uqincw x0, #14
-# CHECK-NEXT: 1 1 0.33 uqincw x0, all, mul #16
-# CHECK-NEXT: 1 1 0.33 uqincw x0, pow2
-# CHECK-NEXT: 1 1 0.33 uqincw x0, vl1
+# CHECK-NEXT: 1 4 0.33 uqincw w0
+# CHECK-NEXT: 1 4 0.33 uqincw w0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincw w0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincw w0, pow2, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincw x0
+# CHECK-NEXT: 1 4 0.33 uqincw x0, #14
+# CHECK-NEXT: 1 4 0.33 uqincw x0, all, mul #16
+# CHECK-NEXT: 1 4 0.33 uqincw x0, pow2
+# CHECK-NEXT: 1 4 0.33 uqincw x0, vl1
# CHECK-NEXT: 1 4 0.50 uqincw z0.s
# CHECK-NEXT: 1 4 0.50 uqincw z0.s, all, mul #16
# CHECK-NEXT: 1 4 0.50 uqincw z0.s, pow2
@@ -6723,18 +6723,18 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 4 0.50 usra z31.d, z31.d, #64
# CHECK-NEXT: 1 4 0.50 usra z31.h, z31.h, #16
# CHECK-NEXT: 1 4 0.50 usra z31.s, z31.s, #32
-# CHECK-NEXT: 1 3 0.50 usublb z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 usublb z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 usublb z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 usublt z0.h, z1.b, z2.b
-# CHECK-NEXT: 1 3 0.50 usublt z29.s, z30.h, z31.h
-# CHECK-NEXT: 1 3 0.50 usublt z31.d, z31.s, z31.s
-# CHECK-NEXT: 1 3 0.50 usubwb z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 usubwb z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 usubwb z31.d, z31.d, z31.s
-# CHECK-NEXT: 1 3 0.50 usubwt z0.h, z1.h, z2.b
-# CHECK-NEXT: 1 3 0.50 usubwt z29.s, z30.s, z31.h
-# CHECK-NEXT: 1 3 0.50 usubwt z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 usublb z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 usublb z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 usublb z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 usublt z0.h, z1.b, z2.b
+# CHECK-NEXT: 1 4 0.50 usublt z29.s, z30.h, z31.h
+# CHECK-NEXT: 1 4 0.50 usublt z31.d, z31.s, z31.s
+# CHECK-NEXT: 1 4 0.50 usubwb z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 usubwb z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 usubwb z31.d, z31.d, z31.s
+# CHECK-NEXT: 1 4 0.50 usubwt z0.h, z1.h, z2.b
+# CHECK-NEXT: 1 4 0.50 usubwt z29.s, z30.s, z31.h
+# CHECK-NEXT: 1 4 0.50 usubwt z31.d, z31.d, z31.s
# CHECK-NEXT: 1 4 0.50 uunpkhi z31.d, z31.s
# CHECK-NEXT: 1 4 0.50 uunpkhi z31.h, z31.b
# CHECK-NEXT: 1 4 0.50 uunpkhi z31.s, z31.h
@@ -6753,82 +6753,82 @@ zip2 z31.s, z31.s, z31.s
# CHECK-NEXT: 1 3 0.50 uxth z31.s, p7/m, z31.s
# CHECK-NEXT: 1 3 0.50 uxtw z0.d, p0/m, z0.d
# CHECK-NEXT: 1 3 0.50 uxtw z31.d, p7/m, z31.d
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 uzp1 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 uzp1 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.b, p15.b, p15.b
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.d, p15.d, p15.d
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.h, p15.h, p15.h
-# CHECK-NEXT: 1 6 1.00 uzp2 p15.s, p15.s, p15.s
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.b, z31.b, z31.b
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.d, z31.d, z31.d
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.h, z31.h, z31.h
-# CHECK-NEXT: 1 4 0.50 uzp2 z31.s, z31.s, z31.s
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, wzr, w0
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.b, xzr, x0
-# CHECK-NEXT: 1 6 1.00 whilege p15.d, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.d, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.h, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.h, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.s, w0, wzr
-# CHECK-NEXT: 1 6 1.00 whilege p15.s, x0, xzr
-# CHECK-NEXT: 1 6 1.00 whilerw p15.b, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilerw p15.d, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilerw p15.h, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilerw p15.s, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.b, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.d, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.h, x30, x30
-# CHECK-NEXT: 1 6 1.00 whilewr p15.s, x30, x30
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 uzp1 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.h, z31.h, z31.h
+# CHECK-NEXT: 1 3 0.50 uzp1 z31.s, z31.s, z31.s
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.b, p15.b, p15.b
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.d, p15.d, p15.d
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.h, p15.h, p15.h
+# CHECK-NEXT: 1 2 1.00 uzp2 p15.s, p15.s, p15.s
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.b, z31.b, z31.b
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.d, z31.d, z31.d
+# CHECK-NEXT: 1 3 0.50 uzp2 z31.h, z31.h, z31.h
CHECK-NEXT: 1 3 0.50 uzp2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 whilege p15.b, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.b, wzr, w0 +# CHECK-NEXT: 1 2 1.00 whilege p15.b, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.b, xzr, x0 +# CHECK-NEXT: 1 2 1.00 whilege p15.d, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.d, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.h, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.h, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilege p15.s, w0, wzr +# CHECK-NEXT: 1 2 1.00 whilege p15.s, x0, xzr +# CHECK-NEXT: 1 2 1.00 whilerw p15.b, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.d, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.h, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilerw p15.s, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.b, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.d, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.h, x30, x30 +# CHECK-NEXT: 1 2 1.00 whilewr p15.s, x30, x30 # CHECK-NEXT: 1 1 0.33 * U wrffr p0.b # CHECK-NEXT: 1 1 0.33 * U wrffr p15.b -# CHECK-NEXT: 1 3 0.50 xar z0.b, z0.b, z1.b, #1 -# CHECK-NEXT: 1 3 0.50 xar z0.d, z0.d, z1.d, #1 -# CHECK-NEXT: 1 3 0.50 xar z0.h, z0.h, z1.h, #1 -# CHECK-NEXT: 1 3 0.50 xar z0.s, z0.s, z1.s, #1 -# CHECK-NEXT: 1 3 0.50 xar z31.b, z31.b, z30.b, #8 -# CHECK-NEXT: 1 3 0.50 xar z31.d, z31.d, z30.d, #64 -# CHECK-NEXT: 1 3 0.50 xar z31.h, z31.h, z30.h, #16 -# CHECK-NEXT: 1 3 0.50 xar z31.s, z31.s, z30.s, #32 -# CHECK-NEXT: 1 6 1.00 zip1 p0.b, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 zip1 p0.d, p0.d, p0.d -# CHECK-NEXT: 1 6 1.00 zip1 p0.h, p0.h, p0.h -# CHECK-NEXT: 1 6 1.00 zip1 p0.s, p0.s, p0.s -# CHECK-NEXT: 1 6 1.00 zip1 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 zip1 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 zip1 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 zip1 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 zip1 z0.b, z0.b, z0.b -# CHECK-NEXT: 1 4 0.50 zip1 z0.d, z0.d, z0.d -# CHECK-NEXT: 1 4 0.50 zip1 z0.h, z0.h, z0.h -# CHECK-NEXT: 1 4 0.50 zip1 z0.s, z0.s, z0.s -# CHECK-NEXT: 1 4 0.50 zip1 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 zip1 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 zip1 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 zip1 z31.s, z31.s, z31.s -# CHECK-NEXT: 1 6 1.00 zip2 p0.b, p0.b, p0.b -# CHECK-NEXT: 1 6 1.00 zip2 p0.d, p0.d, p0.d -# CHECK-NEXT: 1 6 1.00 zip2 p0.h, p0.h, p0.h -# CHECK-NEXT: 1 6 1.00 zip2 p0.s, p0.s, p0.s -# CHECK-NEXT: 1 6 1.00 zip2 p15.b, p15.b, p15.b -# CHECK-NEXT: 1 6 1.00 zip2 p15.d, p15.d, p15.d -# CHECK-NEXT: 1 6 1.00 zip2 p15.h, p15.h, p15.h -# CHECK-NEXT: 1 6 1.00 zip2 p15.s, p15.s, p15.s -# CHECK-NEXT: 1 4 0.50 zip2 z0.b, z0.b, z0.b -# CHECK-NEXT: 1 4 0.50 zip2 z0.d, z0.d, z0.d -# CHECK-NEXT: 1 4 0.50 zip2 z0.h, z0.h, z0.h -# CHECK-NEXT: 1 4 0.50 zip2 z0.s, z0.s, z0.s -# CHECK-NEXT: 1 4 0.50 zip2 z31.b, z31.b, z31.b -# CHECK-NEXT: 1 4 0.50 zip2 z31.d, z31.d, z31.d -# CHECK-NEXT: 1 4 0.50 zip2 z31.h, z31.h, z31.h -# CHECK-NEXT: 1 4 0.50 zip2 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 4 0.50 xar z0.b, z0.b, z1.b, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.d, z0.d, z1.d, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.h, z0.h, z1.h, #1 +# CHECK-NEXT: 1 4 0.50 xar z0.s, z0.s, z1.s, #1 +# CHECK-NEXT: 1 4 0.50 xar z31.b, z31.b, z30.b, #8 +# CHECK-NEXT: 1 4 0.50 xar z31.d, z31.d, z30.d, #64 +# CHECK-NEXT: 1 4 0.50 xar z31.h, z31.h, z30.h, #16 +# CHECK-NEXT: 1 4 0.50 xar z31.s, z31.s, z30.s, #32 +# CHECK-NEXT: 1 2 1.00 zip1 p0.b, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 zip1 p0.d, p0.d, p0.d +# CHECK-NEXT: 1 2 1.00 zip1 p0.h, p0.h, p0.h +# CHECK-NEXT: 1 2 1.00 zip1 p0.s, p0.s, p0.s +# CHECK-NEXT: 1 2 1.00 zip1 p15.b, p15.b, p15.b +# 
CHECK-NEXT: 1 2 1.00 zip1 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 zip1 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 zip1 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 zip1 z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 zip1 z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 zip1 z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 zip1 z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 zip1 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 zip1 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 zip1 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 zip1 z31.s, z31.s, z31.s +# CHECK-NEXT: 1 2 1.00 zip2 p0.b, p0.b, p0.b +# CHECK-NEXT: 1 2 1.00 zip2 p0.d, p0.d, p0.d +# CHECK-NEXT: 1 2 1.00 zip2 p0.h, p0.h, p0.h +# CHECK-NEXT: 1 2 1.00 zip2 p0.s, p0.s, p0.s +# CHECK-NEXT: 1 2 1.00 zip2 p15.b, p15.b, p15.b +# CHECK-NEXT: 1 2 1.00 zip2 p15.d, p15.d, p15.d +# CHECK-NEXT: 1 2 1.00 zip2 p15.h, p15.h, p15.h +# CHECK-NEXT: 1 2 1.00 zip2 p15.s, p15.s, p15.s +# CHECK-NEXT: 1 3 0.50 zip2 z0.b, z0.b, z0.b +# CHECK-NEXT: 1 3 0.50 zip2 z0.d, z0.d, z0.d +# CHECK-NEXT: 1 3 0.50 zip2 z0.h, z0.h, z0.h +# CHECK-NEXT: 1 3 0.50 zip2 z0.s, z0.s, z0.s +# CHECK-NEXT: 1 3 0.50 zip2 z31.b, z31.b, z31.b +# CHECK-NEXT: 1 3 0.50 zip2 z31.d, z31.d, z31.d +# CHECK-NEXT: 1 3 0.50 zip2 z31.h, z31.h, z31.h +# CHECK-NEXT: 1 3 0.50 zip2 z31.s, z31.s, z31.s # CHECK: Resources: # CHECK-NEXT: [0] - CortexA510UnitALU0 @@ -6848,7 +6848,7 @@ zip2 z31.s, z31.s, z31.s # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] -# CHECK-NEXT: 79.00 75.00 75.00 9.00 - 240.00 3698.00 - - 1290.00 924.00 199.50 199.50 670.00 +# CHECK-NEXT: 79.00 75.00 75.00 9.00 - 209.00 3667.00 - - 1290.00 924.00 199.50 199.50 670.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1.0] [1.1] [2] [3] [4] [5] [6] [7] [8] [9] [10.0] [10.1] [11] Instructions: @@ -7844,8 +7844,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.h }, p0/z, [x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1b { z21.b }, p5/z, [x10, #5, mul vl] @@ -7882,8 +7882,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.h }, p0/z, [x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z21.d }, p5/z, [x10, #5, mul vl] @@ -7899,8 
+7899,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z31.h }, p7/z, [sp, #-1, mul vl] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z5.h }, p3/z, [sp, x16, lsl #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1h { z5.h }, p3/z, [x17, x16, lsl #1] @@ -7961,7 +7961,7 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [sp, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.h }, p0/z, [x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sb { z21.d }, p5/z, [x10, #5, mul vl] @@ -7981,8 +7981,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z21.d }, p5/z, [x10, #5, mul vl] @@ -7997,8 +7997,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sh { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] @@ -8017,8 +8017,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: - 
- - - - 0.50 0.50 - - - - - - - ld1w { z0.d }, p0/z, [x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z0.d }, p0/z, [z0.d] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z0.s }, p0/z, [x0] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z21.d }, p5/z, [x10, #5, mul vl] @@ -8033,8 +8033,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ld1w { z31.d }, p7/z, [z31.d, #124] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, #-1, mul vl] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ld1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ld1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: - - - - - - 2.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - - 1.00 - - - - - - - ld2b { z0.b, z1.b }, p0/z, [x0] @@ -8100,8 +8100,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1b { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1b { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1b { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8128,8 +8128,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z0.h }, p0/z, [x0, x0, lsl #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1h { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1h { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8138,16 +8138,16 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.d }, p7/z, [sp] # CHECK-NEXT: - - 
- - - - 7.00 - - - - - - - ldff1h { z31.d }, p7/z, [z31.d, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.h }, p7/z, [sp] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1h { z31.s }, p7/z, [sp] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1h { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.d }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.h }, p0/z, [x0, x0] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, x0] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sb { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sb { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8162,8 +8162,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, x0, lsl #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sh { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8171,8 +8171,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z31.d }, p7/z, [sp] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1sh { z31.d }, p7/z, [z31.d, #62] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, sxtw #1] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp, z31.s, uxtw #1] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sh { z31.s }, p7/z, [sp] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1sh { z31.s }, p7/z, [z31.s, #62] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1sw { z0.d }, p0/z, [x0, x0, lsl #2] @@ -8190,8 +8190,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z0.d }, 
p0/z, [x0, z0.d, uxtw #2] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z0.d }, p0/z, [z0.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, x0, lsl #2] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1w { z0.s }, p0/z, [z0.s] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, sxtw] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z21.d }, p5/z, [x10, z21.d, uxtw] @@ -8199,8 +8199,8 @@ zip2 z31.s, z31.s, z31.s # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z31.d }, p7/z, [sp, z31.d] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z31.d }, p7/z, [sp] # CHECK-NEXT: - - - - - - 7.00 - - - - - - - ldff1w { z31.d }, p7/z, [z31.d, #124] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] -# CHECK-NEXT: - - - - - 4.50 4.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, sxtw #2] +# CHECK-NEXT: - - - - - 3.50 3.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp, z31.s, uxtw #2] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldff1w { z31.s }, p7/z, [sp] # CHECK-NEXT: - - - - - - 9.00 - - - - - - - ldff1w { z31.s }, p7/z, [z31.s, #124] # CHECK-NEXT: - - - - - 0.50 0.50 - - - - - - - ldnf1b { z0.b }, p0/z, [x0] diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp index 0a947f6e206fe..4699fbbea5def 100644 --- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp +++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp @@ -22,7 +22,7 @@ namespace llvm { namespace exegesis { -#if defined(__linux__) && !defined(__ANDROID__) +#if defined(__linux__) long SubprocessMemory::getCurrentTID() { // We're using the raw syscall here rather than the gettid() function provided @@ -31,6 +31,8 @@ long SubprocessMemory::getCurrentTID() { return syscall(SYS_gettid); } +#if !defined(__ANDROID__) + Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) { // Add the PID to the shared memory name so that if we're running multiple // processes at the same time, they won't interfere with each other. 
@@ -157,7 +159,8 @@ Expected SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
SubprocessMemory::~SubprocessMemory() {}
-#endif // defined(__linux__) && !defined(__ANDROID__)
+#endif // !defined(__ANDROID__)
+#endif // defined(__linux__)
} // namespace exegesis
} // namespace llvm
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 8ebd9b511f39f..8738af91b652b 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -2110,8 +2110,7 @@ TEST_F(ValueTrackingTest, isNonZeroRecurrence) {
)");
const DataLayout &DL = M->getDataLayout();
AssumptionCache AC(*F);
- EXPECT_TRUE(isKnownNonZero(A, /*Depth=*/0,
- SimplifyQuery(DL, /*DT=*/nullptr, &AC, CxtI)));
+ EXPECT_TRUE(isKnownNonZero(A, SimplifyQuery(DL, /*DT=*/nullptr, &AC, CxtI)));
}
TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond) {
@@ -2135,9 +2134,8 @@ TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond) {
DominatorTree DT(*F);
const DataLayout &DL = M->getDataLayout();
const SimplifyQuery SQ(DL, &DT, &AC);
- EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI)), true);
- EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI2)),
- false);
+ EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI)), true);
+ EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI2)), false);
}
TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond2) {
@@ -2161,9 +2159,8 @@ TEST_F(ValueTrackingTest, KnownNonZeroFromDomCond2) {
DominatorTree DT(*F);
const DataLayout &DL = M->getDataLayout();
const SimplifyQuery SQ(DL, &DT, &AC);
- EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI)), true);
- EXPECT_EQ(isKnownNonZero(A, /*Depth=*/0, SQ.getWithInstruction(CxtI2)),
- false);
+ EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI)), true);
+ EXPECT_EQ(isKnownNonZero(A, SQ.getWithInstruction(CxtI2)), false);
}
TEST_F(ValueTrackingTest, IsImpliedConditionAnd) {
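The ValueTrackingTest hunks above track an API simplification: isKnownNonZero no longer takes an explicit /*Depth=*/0 argument and is driven entirely by the SimplifyQuery. A minimal sketch of the new call shape, assuming a function F, a context instruction CxtI, and a value V already exist:

    // Sketch only; depth is now managed inside the query machinery.
    const DataLayout &DL = M->getDataLayout();
    AssumptionCache AC(*F);
    DominatorTree DT(*F);
    SimplifyQuery SQ(DL, &DT, &AC);
    bool NonZero = isKnownNonZero(V, SQ.getWithInstruction(CxtI));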
+ EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spir"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spir64"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spirv32"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "spirv64"), "e-p:32:32-G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spir"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spir64"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spirv32"), "G2"); + EXPECT_EQ(UpgradeDataLayoutString("G2", "spirv64"), "G2"); } TEST(DataLayoutUpgradeTest, EmptyDataLayout) { @@ -113,6 +132,14 @@ TEST(DataLayoutUpgradeTest, EmptyDataLayout) { EXPECT_EQ(UpgradeDataLayoutString("", "r600"), "G1"); EXPECT_EQ(UpgradeDataLayoutString("", "amdgcn"), "G1-ni:7:8:9-p7:160:256:256:32-p8:128:128-p9:192:256:256:32"); + + // Check that SPIR & SPIRV targets add G1 if it's not present. + EXPECT_EQ(UpgradeDataLayoutString("", "spir"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spir64"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spirv32"), "G1"); + EXPECT_EQ(UpgradeDataLayoutString("", "spirv64"), "G1"); + // but SPIRV Logical does not. + EXPECT_EQ(UpgradeDataLayoutString("", "spirv"), ""); } } // end namespace diff --git a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp index ff7146eaf9439..eae517f9d01cf 100644 --- a/llvm/unittests/CodeGen/RegAllocScoreTest.cpp +++ b/llvm/unittests/CodeGen/RegAllocScoreTest.cpp @@ -166,19 +166,20 @@ TEST(RegAllocScoreTest, Counts) { ASSERT_EQ(MF->size(), 2U); const auto TotalScore = llvm::calculateRegAllocScore(*MF, MBBFreqMock, IsRemat); - ASSERT_EQ(Freq1, TotalScore.copyCounts()); - ASSERT_EQ(2.0 * Freq1 + Freq2, TotalScore.loadCounts()); - ASSERT_EQ(Freq1 + Freq2, TotalScore.storeCounts()); - ASSERT_EQ(Freq2, TotalScore.loadStoreCounts()); - ASSERT_EQ(Freq1, TotalScore.cheapRematCounts()); - ASSERT_EQ(Freq2, TotalScore.expensiveRematCounts()); - ASSERT_EQ(TotalScore.getScore(), - TotalScore.copyCounts() * CopyWeight + - TotalScore.loadCounts() * LoadWeight + - TotalScore.storeCounts() * StoreWeight + - TotalScore.loadStoreCounts() * (LoadWeight + StoreWeight) + - TotalScore.cheapRematCounts() * CheapRematWeight + - TotalScore.expensiveRematCounts() * ExpensiveRematWeight + ASSERT_DOUBLE_EQ(Freq1, TotalScore.copyCounts()); + ASSERT_DOUBLE_EQ(2.0 * Freq1 + Freq2, TotalScore.loadCounts()); + ASSERT_DOUBLE_EQ(Freq1 + Freq2, TotalScore.storeCounts()); + ASSERT_DOUBLE_EQ(Freq2, TotalScore.loadStoreCounts()); + ASSERT_DOUBLE_EQ(Freq1, TotalScore.cheapRematCounts()); + ASSERT_DOUBLE_EQ(Freq2, TotalScore.expensiveRematCounts()); + ASSERT_DOUBLE_EQ( + TotalScore.getScore(), + TotalScore.copyCounts() * CopyWeight + + TotalScore.loadCounts() * LoadWeight + + TotalScore.storeCounts() * StoreWeight + + TotalScore.loadStoreCounts() * (LoadWeight + StoreWeight) + + TotalScore.cheapRematCounts() * CheapRematWeight + + TotalScore.expensiveRematCounts() * ExpensiveRematWeight ); } diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index db1c4a8951ad2..8344bca08404e 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -2097,7 +2097,7 @@ TEST_F(OpenMPIRBuilderTest, ApplySimdlenSafelen) { })); } -TEST_F(OpenMPIRBuilderTest, ApplySimdLoopIf) { +TEST_F(OpenMPIRBuilderTest, ApplySimdIf) { OpenMPIRBuilder OMPBuilder(*M); IRBuilder<> Builder(BB); MapVector AlignedVars; diff --git 
diff --git a/llvm/unittests/IR/ConstantsTest.cpp b/llvm/unittests/IR/ConstantsTest.cpp
index 1d6a92c498b06..8f0a507c0fd18 100644
--- a/llvm/unittests/IR/ConstantsTest.cpp
+++ b/llvm/unittests/IR/ConstantsTest.cpp
@@ -581,7 +581,7 @@ TEST(ConstantsTest, containsUndefElemTest) {
}
}
-// Check that undefined elements in vector constants are matched
+// Check that poison elements in vector constants are matched
// correctly for both integer and floating-point types. Just don't
// crash on vectors of pointers (could be handled?).
@@ -590,6 +590,7 @@ TEST(ConstantsTest, isElementWiseEqual) {
Type *Int32Ty = Type::getInt32Ty(Context);
Constant *CU = UndefValue::get(Int32Ty);
+ Constant *CP = PoisonValue::get(Int32Ty);
Constant *C1 = ConstantInt::get(Int32Ty, 1);
Constant *C2 = ConstantInt::get(Int32Ty, 2);
@@ -597,15 +598,25 @@
Constant *C12U1 = ConstantVector::get({C1, C2, CU, C1});
Constant *C12U2 = ConstantVector::get({C1, C2, CU, C2});
Constant *C12U21 = ConstantVector::get({C1, C2, CU, C2, C1});
+ Constant *C12P1 = ConstantVector::get({C1, C2, CP, C1});
+ Constant *C12P2 = ConstantVector::get({C1, C2, CP, C2});
+ Constant *C12P21 = ConstantVector::get({C1, C2, CP, C2, C1});
- EXPECT_TRUE(C1211->isElementWiseEqual(C12U1));
- EXPECT_TRUE(C12U1->isElementWiseEqual(C1211));
+ EXPECT_FALSE(C1211->isElementWiseEqual(C12U1));
+ EXPECT_FALSE(C12U1->isElementWiseEqual(C1211));
EXPECT_FALSE(C12U2->isElementWiseEqual(C12U1));
EXPECT_FALSE(C12U1->isElementWiseEqual(C12U2));
EXPECT_FALSE(C12U21->isElementWiseEqual(C12U2));
+ EXPECT_TRUE(C1211->isElementWiseEqual(C12P1));
+ EXPECT_TRUE(C12P1->isElementWiseEqual(C1211));
+ EXPECT_FALSE(C12P2->isElementWiseEqual(C12P1));
+ EXPECT_FALSE(C12P1->isElementWiseEqual(C12P2));
+ EXPECT_FALSE(C12P21->isElementWiseEqual(C12P2));
+
Type *FltTy = Type::getFloatTy(Context);
Constant *CFU = UndefValue::get(FltTy);
+ Constant *CFP = PoisonValue::get(FltTy);
Constant *CF1 = ConstantFP::get(FltTy, 1.0);
Constant *CF2 = ConstantFP::get(FltTy, 2.0);
@@ -613,25 +624,41 @@
Constant *CF12U1 = ConstantVector::get({CF1, CF2, CFU, CF1});
Constant *CF12U2 = ConstantVector::get({CF1, CF2, CFU, CF2});
Constant *CFUU1U = ConstantVector::get({CFU, CFU, CF1, CFU});
+ Constant *CF12P1 = ConstantVector::get({CF1, CF2, CFP, CF1});
+ Constant *CF12P2 = ConstantVector::get({CF1, CF2, CFP, CF2});
+ Constant *CFPP1P = ConstantVector::get({CFP, CFP, CF1, CFP});
- EXPECT_TRUE(CF1211->isElementWiseEqual(CF12U1));
- EXPECT_TRUE(CF12U1->isElementWiseEqual(CF1211));
- EXPECT_TRUE(CFUU1U->isElementWiseEqual(CF12U1));
+ EXPECT_FALSE(CF1211->isElementWiseEqual(CF12U1));
+ EXPECT_FALSE(CF12U1->isElementWiseEqual(CF1211));
+ EXPECT_FALSE(CFUU1U->isElementWiseEqual(CF12U1));
EXPECT_FALSE(CF12U2->isElementWiseEqual(CF12U1));
EXPECT_FALSE(CF12U1->isElementWiseEqual(CF12U2));
+ EXPECT_TRUE(CF1211->isElementWiseEqual(CF12P1));
+ EXPECT_TRUE(CF12P1->isElementWiseEqual(CF1211));
+ EXPECT_TRUE(CFPP1P->isElementWiseEqual(CF12P1));
+ EXPECT_FALSE(CF12P2->isElementWiseEqual(CF12P1));
+ EXPECT_FALSE(CF12P1->isElementWiseEqual(CF12P2));
+
PointerType *PtrTy = PointerType::get(Context, 0);
Constant *CPU = UndefValue::get(PtrTy);
+ Constant *CPP = PoisonValue::get(PtrTy);
Constant *CP0 = ConstantPointerNull::get(PtrTy);
Constant *CP0000 = ConstantVector::get({CP0, CP0, CP0, CP0});
Constant *CP00U0 = ConstantVector::get({CP0, CP0, CPU, CP0});
Constant *CP00U = ConstantVector::get({CP0, CP0, CPU});
+ Constant *CP00P0 = ConstantVector::get({CP0, CP0, CPP, CP0});
+ Constant *CP00P = ConstantVector::get({CP0, CP0, CPP});
EXPECT_FALSE(CP0000->isElementWiseEqual(CP00U0));
EXPECT_FALSE(CP00U0->isElementWiseEqual(CP0000));
EXPECT_FALSE(CP0000->isElementWiseEqual(CP00U));
EXPECT_FALSE(CP00U->isElementWiseEqual(CP00U0));
+ EXPECT_FALSE(CP0000->isElementWiseEqual(CP00P0));
+ EXPECT_FALSE(CP00P0->isElementWiseEqual(CP0000));
+ EXPECT_FALSE(CP0000->isElementWiseEqual(CP00P));
+ EXPECT_FALSE(CP00P->isElementWiseEqual(CP00P0));
}
// Check that vector/aggregate constants correctly store undef and poison
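The flipped expectations above reflect the undef/poison split: a poison lane may be refined to any value, while an undef lane is no longer treated as a wildcard by isElementWiseEqual. A trimmed sketch of the distinction, assuming the usual LLVM headers and a live LLVMContext (not a drop-in test):

    LLVMContext Ctx;
    Type *I32 = Type::getInt32Ty(Ctx);
    Constant *One = ConstantInt::get(I32, 1);
    Constant *VP = ConstantVector::get({One, PoisonValue::get(I32)}); // <1, poison>
    Constant *VU = ConstantVector::get({One, UndefValue::get(I32)});  // <1, undef>
    Constant *V1 = ConstantVector::get({One, One});                   // <1, 1>
    // Poison lanes compare equal to anything: <1, poison> == <1, 1>.
    bool PoisonEq = V1->isElementWiseEqual(VP); // true
    // Undef lanes no longer do: <1, undef> != <1, 1> under this predicate.
    bool UndefEq = V1->isElementWiseEqual(VU);  // false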
diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp
index 4d0c2e4220fec..133012684d16d 100644
--- a/llvm/unittests/IR/PatternMatch.cpp
+++ b/llvm/unittests/IR/PatternMatch.cpp
@@ -1184,6 +1184,8 @@ TEST_F(PatternMatchTest, VectorUndefInt) {
Type *VectorTy = FixedVectorType::get(ScalarTy, 4);
Constant *ScalarUndef = UndefValue::get(ScalarTy);
Constant *VectorUndef = UndefValue::get(VectorTy);
+ Constant *ScalarPoison = PoisonValue::get(ScalarTy);
+ Constant *VectorPoison = PoisonValue::get(VectorTy);
Constant *ScalarZero = Constant::getNullValue(ScalarTy);
Constant *VectorZero = Constant::getNullValue(VectorTy);
@@ -1194,17 +1196,30 @@
Elems.push_back(ScalarZero);
Constant *VectorZeroUndef = ConstantVector::get(Elems);
+ SmallVector Elems2;
+ Elems2.push_back(ScalarPoison);
+ Elems2.push_back(ScalarZero);
+ Elems2.push_back(ScalarPoison);
+ Elems2.push_back(ScalarZero);
+ Constant *VectorZeroPoison = ConstantVector::get(Elems2);
+
EXPECT_TRUE(match(ScalarUndef, m_Undef()));
+ EXPECT_TRUE(match(ScalarPoison, m_Undef()));
EXPECT_TRUE(match(VectorUndef, m_Undef()));
+ EXPECT_TRUE(match(VectorPoison, m_Undef()));
EXPECT_FALSE(match(ScalarZero, m_Undef()));
EXPECT_FALSE(match(VectorZero, m_Undef()));
EXPECT_FALSE(match(VectorZeroUndef, m_Undef()));
+ EXPECT_FALSE(match(VectorZeroPoison, m_Undef()));
EXPECT_FALSE(match(ScalarUndef, m_Zero()));
+ EXPECT_FALSE(match(ScalarPoison, m_Zero()));
EXPECT_FALSE(match(VectorUndef, m_Zero()));
+ EXPECT_FALSE(match(VectorPoison, m_Zero()));
+ EXPECT_FALSE(match(VectorZeroUndef, m_Zero()));
EXPECT_TRUE(match(ScalarZero, m_Zero()));
EXPECT_TRUE(match(VectorZero, m_Zero()));
- EXPECT_TRUE(match(VectorZeroUndef, m_Zero()));
+ EXPECT_TRUE(match(VectorZeroPoison, m_Zero()));
const APInt *C;
// Regardless of whether undefs are allowed,
@@ -1249,6 +1264,8 @@
Type *VectorTy = FixedVectorType::get(ScalarTy, 4);
Constant *ScalarUndef = UndefValue::get(ScalarTy);
Constant *VectorUndef = UndefValue::get(VectorTy);
+ Constant *ScalarPoison = PoisonValue::get(ScalarTy);
+ Constant *VectorPoison = PoisonValue::get(VectorTy);
Constant *ScalarZero = Constant::getNullValue(ScalarTy);
Constant *VectorZero = Constant::getNullValue(VectorTy);
Constant *ScalarPosInf = ConstantFP::getInfinity(ScalarTy, false);
@@ -1258,72 +1275,116 @@
Constant *VectorZeroUndef = ConstantVector::get({ScalarUndef, ScalarZero, ScalarUndef, ScalarZero});
+ Constant *VectorZeroPoison =
+ ConstantVector::get({ScalarPoison, ScalarZero, ScalarPoison, ScalarZero});
+
Constant *VectorInfUndef = ConstantVector::get(
{ScalarPosInf, ScalarNegInf, ScalarUndef, ScalarPosInf});
+ Constant *VectorInfPoison = ConstantVector::get(
+ {ScalarPosInf, ScalarNegInf, ScalarPoison, ScalarPosInf});
+
Constant *VectorNaNUndef = ConstantVector::get({ScalarUndef, ScalarNaN, ScalarNaN, ScalarNaN});
+ Constant *VectorNaNPoison =
+ ConstantVector::get({ScalarPoison, ScalarNaN, ScalarNaN, ScalarNaN});
+
EXPECT_TRUE(match(ScalarUndef, m_Undef()));
EXPECT_TRUE(match(VectorUndef, m_Undef()));
+ EXPECT_TRUE(match(ScalarPoison, m_Undef()));
+ EXPECT_TRUE(match(VectorPoison, m_Undef()));
EXPECT_FALSE(match(ScalarZero, m_Undef()));
EXPECT_FALSE(match(VectorZero, m_Undef()));
EXPECT_FALSE(match(VectorZeroUndef, m_Undef()));
EXPECT_FALSE(match(VectorInfUndef, m_Undef()));
EXPECT_FALSE(match(VectorNaNUndef, m_Undef()));
+ EXPECT_FALSE(match(VectorZeroPoison, m_Undef()));
+ EXPECT_FALSE(match(VectorInfPoison, m_Undef()));
+ EXPECT_FALSE(match(VectorNaNPoison, m_Undef()));
EXPECT_FALSE(match(ScalarUndef, m_AnyZeroFP()));
EXPECT_FALSE(match(VectorUndef, m_AnyZeroFP()));
+ EXPECT_FALSE(match(ScalarPoison, m_AnyZeroFP()));
+ EXPECT_FALSE(match(VectorPoison, m_AnyZeroFP()));
EXPECT_TRUE(match(ScalarZero, m_AnyZeroFP()));
EXPECT_TRUE(match(VectorZero, m_AnyZeroFP()));
- EXPECT_TRUE(match(VectorZeroUndef, m_AnyZeroFP()));
+ EXPECT_FALSE(match(VectorZeroUndef, m_AnyZeroFP()));
EXPECT_FALSE(match(VectorInfUndef, m_AnyZeroFP()));
EXPECT_FALSE(match(VectorNaNUndef, m_AnyZeroFP()));
+ EXPECT_TRUE(match(VectorZeroPoison, m_AnyZeroFP()));
+ EXPECT_FALSE(match(VectorInfPoison, m_AnyZeroFP()));
+ EXPECT_FALSE(match(VectorNaNPoison, m_AnyZeroFP()));
EXPECT_FALSE(match(ScalarUndef, m_NaN()));
EXPECT_FALSE(match(VectorUndef, m_NaN()));
EXPECT_FALSE(match(VectorZeroUndef, m_NaN()));
+ EXPECT_FALSE(match(ScalarPoison, m_NaN()));
+ EXPECT_FALSE(match(VectorPoison, m_NaN()));
+ EXPECT_FALSE(match(VectorZeroPoison, m_NaN()));
EXPECT_FALSE(match(ScalarPosInf, m_NaN()));
EXPECT_FALSE(match(ScalarNegInf, m_NaN()));
EXPECT_TRUE(match(ScalarNaN, m_NaN()));
EXPECT_FALSE(match(VectorInfUndef, m_NaN()));
- EXPECT_TRUE(match(VectorNaNUndef, m_NaN()));
+ EXPECT_FALSE(match(VectorNaNUndef, m_NaN()));
+ EXPECT_FALSE(match(VectorInfPoison, m_NaN()));
+ EXPECT_TRUE(match(VectorNaNPoison, m_NaN()));
EXPECT_FALSE(match(ScalarUndef, m_NonNaN()));
EXPECT_FALSE(match(VectorUndef, m_NonNaN()));
- EXPECT_TRUE(match(VectorZeroUndef, m_NonNaN()));
+ EXPECT_FALSE(match(VectorZeroUndef, m_NonNaN()));
+ EXPECT_FALSE(match(ScalarPoison, m_NonNaN()));
+ EXPECT_FALSE(match(VectorPoison, m_NonNaN()));
+ EXPECT_TRUE(match(VectorZeroPoison, m_NonNaN()));
EXPECT_TRUE(match(ScalarPosInf, m_NonNaN()));
EXPECT_TRUE(match(ScalarNegInf, m_NonNaN()));
EXPECT_FALSE(match(ScalarNaN, m_NonNaN()));
- EXPECT_TRUE(match(VectorInfUndef, m_NonNaN()));
+ EXPECT_FALSE(match(VectorInfUndef, m_NonNaN()));
EXPECT_FALSE(match(VectorNaNUndef, m_NonNaN()));
+ EXPECT_TRUE(match(VectorInfPoison, m_NonNaN()));
+ EXPECT_FALSE(match(VectorNaNPoison, m_NonNaN()));
EXPECT_FALSE(match(ScalarUndef, m_Inf()));
EXPECT_FALSE(match(VectorUndef, m_Inf()));
EXPECT_FALSE(match(VectorZeroUndef, m_Inf()));
+ EXPECT_FALSE(match(ScalarPoison, m_Inf()));
+ EXPECT_FALSE(match(VectorPoison, m_Inf()));
+ EXPECT_FALSE(match(VectorZeroPoison, m_Inf()));
EXPECT_TRUE(match(ScalarPosInf, m_Inf()));
EXPECT_TRUE(match(ScalarNegInf, m_Inf()));
EXPECT_FALSE(match(ScalarNaN, m_Inf()));
- EXPECT_TRUE(match(VectorInfUndef, m_Inf()));
+ EXPECT_FALSE(match(VectorInfUndef, m_Inf()));
EXPECT_FALSE(match(VectorNaNUndef, m_Inf()));
+ EXPECT_TRUE(match(VectorInfPoison, m_Inf()));
+ EXPECT_FALSE(match(VectorNaNPoison, m_Inf()));
EXPECT_FALSE(match(ScalarUndef, m_NonInf()));
EXPECT_FALSE(match(VectorUndef, m_NonInf()));
- EXPECT_TRUE(match(VectorZeroUndef, m_NonInf()));
+ EXPECT_FALSE(match(VectorZeroUndef, m_NonInf()));
+ EXPECT_FALSE(match(ScalarPoison, m_NonInf()));
+ EXPECT_FALSE(match(VectorPoison, m_NonInf()));
+ EXPECT_TRUE(match(VectorZeroPoison, m_NonInf()));
EXPECT_FALSE(match(ScalarPosInf, m_NonInf()));
EXPECT_FALSE(match(ScalarNegInf, m_NonInf()));
EXPECT_TRUE(match(ScalarNaN, m_NonInf()));
EXPECT_FALSE(match(VectorInfUndef, m_NonInf()));
- EXPECT_TRUE(match(VectorNaNUndef, m_NonInf()));
+ EXPECT_FALSE(match(VectorNaNUndef, m_NonInf()));
+ EXPECT_FALSE(match(VectorInfPoison, m_NonInf()));
+ EXPECT_TRUE(match(VectorNaNPoison, m_NonInf()));
EXPECT_FALSE(match(ScalarUndef, m_Finite()));
EXPECT_FALSE(match(VectorUndef, m_Finite()));
- EXPECT_TRUE(match(VectorZeroUndef, m_Finite()));
+ EXPECT_FALSE(match(VectorZeroUndef, m_Finite()));
+ EXPECT_FALSE(match(ScalarPoison, m_Finite()));
+ EXPECT_FALSE(match(VectorPoison, m_Finite()));
+ EXPECT_TRUE(match(VectorZeroPoison, m_Finite()));
EXPECT_FALSE(match(ScalarPosInf, m_Finite()));
EXPECT_FALSE(match(ScalarNegInf, m_Finite()));
EXPECT_FALSE(match(ScalarNaN, m_Finite()));
EXPECT_FALSE(match(VectorInfUndef, m_Finite()));
EXPECT_FALSE(match(VectorNaNUndef, m_Finite()));
+ EXPECT_FALSE(match(VectorInfPoison, m_Finite()));
+ EXPECT_FALSE(match(VectorNaNPoison, m_Finite()));
const APFloat *C;
// Regardless of whether undefs are allowed,
@@ -1707,38 +1768,57 @@ TEST_F(PatternMatchTest, ConstantPredicateType) {
Constant *CMixedU32 = ConstantVector::get({CU32Max, CU32Zero, CU32DeadBeef});
Constant *CU32Undef = UndefValue::get(U32Ty);
+ Constant *CU32Poison = PoisonValue::get(U32Ty);
Constant *CU32MaxWithUndef = ConstantVector::get({CU32Undef, CU32Max, CU32Undef});
+ Constant *CU32MaxWithPoison =
+ ConstantVector::get({CU32Poison, CU32Max, CU32Poison});
EXPECT_FALSE(match(CMixedU32, cst_pred_ty()));
EXPECT_FALSE(match(CMixedU32, cst_pred_ty()));
EXPECT_TRUE(match(CMixedU32, cst_pred_ty>()));
EXPECT_FALSE(match(CMixedU32, cst_pred_ty>()));
- EXPECT_TRUE(match(CU32MaxWithUndef, cst_pred_ty()));
+ EXPECT_FALSE(match(CU32MaxWithUndef, cst_pred_ty()));
EXPECT_FALSE(match(CU32MaxWithUndef, cst_pred_ty()));
- EXPECT_TRUE(match(CU32MaxWithUndef, cst_pred_ty>()));
+ EXPECT_FALSE(match(CU32MaxWithUndef, cst_pred_ty>()));
EXPECT_FALSE(
match(CU32MaxWithUndef, cst_pred_ty>()));
+ EXPECT_TRUE(match(CU32MaxWithPoison, cst_pred_ty()));
+ EXPECT_FALSE(match(CU32MaxWithPoison, cst_pred_ty()));
+ EXPECT_TRUE(match(CU32MaxWithPoison, cst_pred_ty>()));
+ EXPECT_FALSE(
+ match(CU32MaxWithPoison, cst_pred_ty>()));
+
// Float arbitrary vector
Constant *CMixedF32 = ConstantVector::get({CF32NaN, CF32Zero, CF32Pi});
Constant *CF32Undef = UndefValue::get(F32Ty);
+ Constant *CF32Poison = PoisonValue::get(F32Ty);
Constant *CF32NaNWithUndef = ConstantVector::get({CF32Undef, CF32NaN, CF32Undef});
+ Constant *CF32NaNWithPoison =
+ ConstantVector::get({CF32Poison, CF32NaN, CF32Poison});
EXPECT_FALSE(match(CMixedF32, cstfp_pred_ty()));
EXPECT_FALSE(match(CMixedF32, cstfp_pred_ty()));
EXPECT_TRUE(match(CMixedF32, cstfp_pred_ty>()));
EXPECT_FALSE(match(CMixedF32, cstfp_pred_ty>()));
- EXPECT_TRUE(match(CF32NaNWithUndef, cstfp_pred_ty()));
+ EXPECT_FALSE(match(CF32NaNWithUndef, cstfp_pred_ty()));
EXPECT_FALSE(match(CF32NaNWithUndef, cstfp_pred_ty()));
- EXPECT_TRUE(
+ EXPECT_FALSE(
match(CF32NaNWithUndef, cstfp_pred_ty>()));
EXPECT_FALSE(
match(CF32NaNWithUndef, cstfp_pred_ty>()));
+
+ EXPECT_TRUE(match(CF32NaNWithPoison, cstfp_pred_ty()));
+ EXPECT_FALSE(match(CF32NaNWithPoison, cstfp_pred_ty()));
+ EXPECT_TRUE(
+ match(CF32NaNWithPoison, cstfp_pred_ty>()));
+ EXPECT_FALSE(
+ match(CF32NaNWithPoison, cstfp_pred_ty>()));
}
TEST_F(PatternMatchTest, InsertValue) {
@@ -1888,35 +1968,44 @@ TEST_F(PatternMatchTest, NotForbidUndef) {
Type *ScalarTy = IRB.getInt8Ty();
Type *VectorTy = FixedVectorType::get(ScalarTy, 3);
Constant *ScalarUndef = UndefValue::get(ScalarTy);
+ Constant *ScalarPoison = PoisonValue::get(ScalarTy);
Constant *ScalarOnes = Constant::getAllOnesValue(ScalarTy);
Constant *VectorZero = Constant::getNullValue(VectorTy);
Constant *VectorOnes = Constant::getAllOnesValue(VectorTy);
- SmallVector MixedElems;
- MixedElems.push_back(ScalarOnes);
- MixedElems.push_back(ScalarOnes);
- MixedElems.push_back(ScalarUndef);
- Constant *VectorMixed = ConstantVector::get(MixedElems);
+ SmallVector MixedElemsUndef;
+ MixedElemsUndef.push_back(ScalarOnes);
+ MixedElemsUndef.push_back(ScalarOnes);
+ MixedElemsUndef.push_back(ScalarUndef);
+ Constant *VectorMixedUndef = ConstantVector::get(MixedElemsUndef);
+
+ SmallVector MixedElemsPoison;
+ MixedElemsPoison.push_back(ScalarOnes);
+ MixedElemsPoison.push_back(ScalarOnes);
+ MixedElemsPoison.push_back(ScalarPoison);
+ Constant *VectorMixedPoison = ConstantVector::get(MixedElemsPoison);
Value *Not = IRB.CreateXor(VectorZero, VectorOnes);
Value *X;
- EXPECT_TRUE(match(Not, m_Not(m_Value())));
- EXPECT_TRUE(match(Not, m_NotForbidUndef(m_Value(X))));
+ EXPECT_TRUE(match(Not, m_Not(m_Value(X))));
EXPECT_TRUE(match(X, m_Zero()));
Value *NotCommute = IRB.CreateXor(VectorOnes, VectorZero);
Value *Y;
- EXPECT_TRUE(match(NotCommute, m_Not(m_Value())));
- EXPECT_TRUE(match(NotCommute, m_NotForbidUndef(m_Value(Y))));
+ EXPECT_TRUE(match(NotCommute, m_Not(m_Value(Y))));
EXPECT_TRUE(match(Y, m_Zero()));
- Value *NotWithUndefs = IRB.CreateXor(VectorZero, VectorMixed);
- EXPECT_TRUE(match(NotWithUndefs, m_Not(m_Value())));
- EXPECT_FALSE(match(NotWithUndefs, m_NotForbidUndef(m_Value())));
+ Value *NotWithUndefs = IRB.CreateXor(VectorZero, VectorMixedUndef);
+ EXPECT_FALSE(match(NotWithUndefs, m_Not(m_Value())));
+
+ Value *NotWithPoisons = IRB.CreateXor(VectorZero, VectorMixedPoison);
+ EXPECT_TRUE(match(NotWithPoisons, m_Not(m_Value())));
+
+ Value *NotWithUndefsCommute = IRB.CreateXor(VectorMixedUndef, VectorZero);
+ EXPECT_FALSE(match(NotWithUndefsCommute, m_Not(m_Value())));
- Value *NotWithUndefsCommute = IRB.CreateXor(VectorMixed, VectorZero);
- EXPECT_TRUE(match(NotWithUndefsCommute, m_Not(m_Value())));
- EXPECT_FALSE(match(NotWithUndefsCommute, m_NotForbidUndef(m_Value(X))));
+ Value *NotWithPoisonsCommute = IRB.CreateXor(VectorMixedPoison, VectorZero);
+ EXPECT_TRUE(match(NotWithPoisonsCommute, m_Not(m_Value())));
}
template struct MutableConstTest : PatternMatchTest { };
diff --git a/llvm/unittests/ProfileData/MemProfTest.cpp b/llvm/unittests/ProfileData/MemProfTest.cpp
index 9cf307472d656..7e00a80cacf93 100644
--- a/llvm/unittests/ProfileData/MemProfTest.cpp
+++ b/llvm/unittests/ProfileData/MemProfTest.cpp
@@ -21,9 +21,11 @@
using ::llvm::DILineInfo;
using ::llvm::DILineInfoSpecifier;
using ::llvm::DILocal;
using ::llvm::StringRef;
+using ::llvm::memprof::CallStackId;
using ::llvm::memprof::CallStackMap;
using ::llvm::memprof::Frame;
using ::llvm::memprof::FrameId;
+using ::llvm::memprof::IndexedAllocationInfo;
using ::llvm::memprof::IndexedMemProfRecord;
using ::llvm::memprof::MemInfoBlock;
using ::llvm::memprof::MemProfReader;
@@ -36,6 +38,7 @@
using ::llvm::memprof::SegmentEntry;
using ::llvm::object::SectionedAddress;
using ::llvm::symbolize::SymbolizableModule;
using ::testing::Return;
+using ::testing::SizeIs;
class MockSymbolizer : public SymbolizableModule {
public:
@@ -180,13 +183,13 @@ TEST(MemProf, FillsValue) {
// We expect 4 records. We attach alloc site data to foo and bar, i.e.
// all frames bottom up until we find a non-inline frame. We attach call site
// data to bar, xyz and abc.
- ASSERT_EQ(Records.size(), 4U);
+ ASSERT_THAT(Records, SizeIs(4));
// Check the memprof record for foo.
const llvm::GlobalValue::GUID FooId = IndexedMemProfRecord::getGUID("foo");
ASSERT_EQ(Records.count(FooId), 1U);
const MemProfRecord &Foo = Records[FooId];
- ASSERT_EQ(Foo.AllocSites.size(), 1U);
+ ASSERT_THAT(Foo.AllocSites, SizeIs(1));
EXPECT_EQ(Foo.AllocSites[0].Info.getAllocCount(), 1U);
EXPECT_THAT(Foo.AllocSites[0].CallStack[0], FrameContains("foo", 5U, 30U, true));
@@ -202,7 +205,7 @@ TEST(MemProf, FillsValue) {
const llvm::GlobalValue::GUID BarId = IndexedMemProfRecord::getGUID("bar");
ASSERT_EQ(Records.count(BarId), 1U);
const MemProfRecord &Bar = Records[BarId];
- ASSERT_EQ(Bar.AllocSites.size(), 1U);
+ ASSERT_THAT(Bar.AllocSites, SizeIs(1));
EXPECT_EQ(Bar.AllocSites[0].Info.getAllocCount(), 1U);
EXPECT_THAT(Bar.AllocSites[0].CallStack[0], FrameContains("foo", 5U, 30U, true));
@@ -213,8 +216,8 @@ TEST(MemProf, FillsValue) {
EXPECT_THAT(Bar.AllocSites[0].CallStack[3], FrameContains("abc", 5U, 30U, false));
- ASSERT_EQ(Bar.CallSites.size(), 1U);
- ASSERT_EQ(Bar.CallSites[0].size(), 2U);
+ ASSERT_THAT(Bar.CallSites, SizeIs(1));
+ ASSERT_THAT(Bar.CallSites[0], SizeIs(2));
EXPECT_THAT(Bar.CallSites[0][0], FrameContains("foo", 5U, 30U, true));
EXPECT_THAT(Bar.CallSites[0][1], FrameContains("bar", 51U, 20U, false));
@@ -222,8 +225,8 @@ TEST(MemProf, FillsValue) {
const llvm::GlobalValue::GUID XyzId = IndexedMemProfRecord::getGUID("xyz");
ASSERT_EQ(Records.count(XyzId), 1U);
const MemProfRecord &Xyz = Records[XyzId];
- ASSERT_EQ(Xyz.CallSites.size(), 1U);
- ASSERT_EQ(Xyz.CallSites[0].size(), 2U);
+ ASSERT_THAT(Xyz.CallSites, SizeIs(1));
+ ASSERT_THAT(Xyz.CallSites[0], SizeIs(2));
// Expect the entire frame even though in practice we only need the first
// entry here.
EXPECT_THAT(Xyz.CallSites[0][0], FrameContains("xyz", 5U, 30U, true));
@@ -234,8 +237,8 @@ TEST(MemProf, FillsValue) {
ASSERT_EQ(Records.count(AbcId), 1U);
const MemProfRecord &Abc = Records[AbcId];
EXPECT_TRUE(Abc.AllocSites.empty());
- ASSERT_EQ(Abc.CallSites.size(), 1U);
- ASSERT_EQ(Abc.CallSites[0].size(), 2U);
+ ASSERT_THAT(Abc.CallSites, SizeIs(1));
+ ASSERT_THAT(Abc.CallSites[0], SizeIs(2));
EXPECT_THAT(Abc.CallSites[0][0], FrameContains("xyz", 5U, 30U, true));
EXPECT_THAT(Abc.CallSites[0][1], FrameContains("abc", 5U, 30U, false));
}
@@ -390,9 +393,9 @@ TEST(MemProf, SymbolizationFilter) {
Records.push_back(KeyRecordPair.second);
}
- ASSERT_EQ(Records.size(), 1U);
- ASSERT_EQ(Records[0].AllocSites.size(), 1U);
- ASSERT_EQ(Records[0].AllocSites[0].CallStack.size(), 1U);
+ ASSERT_THAT(Records, SizeIs(1));
+ ASSERT_THAT(Records[0].AllocSites, SizeIs(1));
+ ASSERT_THAT(Records[0].AllocSites[0].CallStack, SizeIs(1));
EXPECT_THAT(Records[0].AllocSites[0].CallStack[0], FrameContains("foo", 5U, 30U, false));
}
@@ -424,12 +427,135 @@ TEST(MemProf, BaseMemProfReader) {
Records.push_back(KeyRecordPair.second);
}
- ASSERT_EQ(Records.size(), 1U);
- ASSERT_EQ(Records[0].AllocSites.size(), 1U);
- ASSERT_EQ(Records[0].AllocSites[0].CallStack.size(), 2U);
+ ASSERT_THAT(Records, SizeIs(1));
+ ASSERT_THAT(Records[0].AllocSites, SizeIs(1));
+ ASSERT_THAT(Records[0].AllocSites[0].CallStack, SizeIs(2));
EXPECT_THAT(Records[0].AllocSites[0].CallStack[0], FrameContains("foo", 20U, 5U, true));
EXPECT_THAT(Records[0].AllocSites[0].CallStack[1], FrameContains("bar", 10U, 2U, false));
}
+
+TEST(MemProf, BaseMemProfReaderWithCSIdMap) {
+ llvm::DenseMap FrameIdMap;
+ Frame F1(/*Hash=*/IndexedMemProfRecord::getGUID("foo"), /*LineOffset=*/20,
+ /*Column=*/5, /*IsInlineFrame=*/true);
+ Frame F2(/*Hash=*/IndexedMemProfRecord::getGUID("bar"), /*LineOffset=*/10,
+ /*Column=*/2, /*IsInlineFrame=*/false);
+ FrameIdMap.insert({F1.hash(), F1});
+ FrameIdMap.insert({F2.hash(), F2});
+
+ llvm::DenseMap> CSIdMap;
+ llvm::SmallVector CallStack = {F1.hash(), F2.hash()};
+ CallStackId CSId = llvm::memprof::hashCallStack(CallStack);
+ CSIdMap.insert({CSId, CallStack});
+
+ llvm::MapVector ProfData;
+ IndexedMemProfRecord FakeRecord;
+ MemInfoBlock Block;
+ Block.AllocCount = 1U, Block.TotalAccessDensity = 4,
+ Block.TotalLifetime = 200001;
+ FakeRecord.AllocSites.emplace_back(
+ /*CS=*/llvm::SmallVector(),
+ /*CSId=*/llvm::memprof::hashCallStack(CallStack),
+ /*MB=*/Block);
+ ProfData.insert({F1.hash(), FakeRecord});
+
+ MemProfReader Reader(FrameIdMap, CSIdMap, ProfData);
+
+ llvm::SmallVector Records;
+ for (const auto &KeyRecordPair : Reader) {
+ Records.push_back(KeyRecordPair.second);
+ }
+
+ ASSERT_THAT(Records, SizeIs(1));
+ ASSERT_THAT(Records[0].AllocSites, SizeIs(1));
+ ASSERT_THAT(Records[0].AllocSites[0].CallStack, SizeIs(2));
+ EXPECT_THAT(Records[0].AllocSites[0].CallStack[0],
+ FrameContains("foo", 20U, 5U, true));
+ EXPECT_THAT(Records[0].AllocSites[0].CallStack[1],
+ FrameContains("bar", 10U, 2U, false));
+}
+
+TEST(MemProf, IndexedMemProfRecordToMemProfRecord) {
+ // Verify that MemProfRecord can be constructed from IndexedMemProfRecord with
+ // CallStackIds only.
+
+ llvm::DenseMap FrameIdMap;
+ Frame F1(1, 0, 0, false);
+ Frame F2(2, 0, 0, false);
+ Frame F3(3, 0, 0, false);
+ Frame F4(4, 0, 0, false);
+ FrameIdMap.insert({F1.hash(), F1});
+ FrameIdMap.insert({F2.hash(), F2});
+ FrameIdMap.insert({F3.hash(), F3});
+ FrameIdMap.insert({F4.hash(), F4});
+
+ llvm::DenseMap> CallStackIdMap;
+ llvm::SmallVector CS1 = {F1.hash(), F2.hash()};
+ llvm::SmallVector CS2 = {F1.hash(), F3.hash()};
+ llvm::SmallVector CS3 = {F2.hash(), F3.hash()};
+ llvm::SmallVector CS4 = {F2.hash(), F4.hash()};
+ CallStackIdMap.insert({llvm::memprof::hashCallStack(CS1), CS1});
+ CallStackIdMap.insert({llvm::memprof::hashCallStack(CS2), CS2});
+ CallStackIdMap.insert({llvm::memprof::hashCallStack(CS3), CS3});
+ CallStackIdMap.insert({llvm::memprof::hashCallStack(CS4), CS4});
+
+ IndexedMemProfRecord IndexedRecord;
+ IndexedAllocationInfo AI;
+ AI.CSId = llvm::memprof::hashCallStack(CS1);
+ IndexedRecord.AllocSites.push_back(AI);
+ AI.CSId = llvm::memprof::hashCallStack(CS2);
+ IndexedRecord.AllocSites.push_back(AI);
+ IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS3));
+ IndexedRecord.CallSiteIds.push_back(llvm::memprof::hashCallStack(CS4));
+
+ bool CSIdMissing = false;
+ bool FrameIdMissing = false;
+
+ auto Callback = [&](CallStackId CSId) -> llvm::SmallVector {
+ llvm::SmallVector CallStack;
+ llvm::SmallVector FrameIds;
+
+ auto Iter = CallStackIdMap.find(CSId);
+ if (Iter == CallStackIdMap.end())
+ CSIdMissing = true;
+ else
+ FrameIds = Iter->second;
+
+ for (FrameId Id : FrameIds) {
+ Frame F(0, 0, 0, false);
+ auto Iter = FrameIdMap.find(Id);
+ if (Iter == FrameIdMap.end())
+ FrameIdMissing = true;
+ else
+ F = Iter->second;
+ CallStack.push_back(F);
+ }
+
+ return CallStack;
+ };
+
+ MemProfRecord Record = IndexedRecord.toMemProfRecord(Callback);
+
+ // Make sure that all lookups are successful.
+ ASSERT_FALSE(CSIdMissing);
+ ASSERT_FALSE(FrameIdMissing);
+
+ // Verify the contents of Record.
+ ASSERT_THAT(Record.AllocSites, SizeIs(2));
+ ASSERT_THAT(Record.AllocSites[0].CallStack, SizeIs(2));
+ EXPECT_EQ(Record.AllocSites[0].CallStack[0].hash(), F1.hash());
+ EXPECT_EQ(Record.AllocSites[0].CallStack[1].hash(), F2.hash());
+ ASSERT_THAT(Record.AllocSites[1].CallStack, SizeIs(2));
+ EXPECT_EQ(Record.AllocSites[1].CallStack[0].hash(), F1.hash());
+ EXPECT_EQ(Record.AllocSites[1].CallStack[1].hash(), F3.hash());
+ ASSERT_THAT(Record.CallSites, SizeIs(2));
+ ASSERT_THAT(Record.CallSites[0], SizeIs(2));
+ EXPECT_EQ(Record.CallSites[0][0].hash(), F2.hash());
+ EXPECT_EQ(Record.CallSites[0][1].hash(), F3.hash());
+ ASSERT_THAT(Record.CallSites[1], SizeIs(2));
+ EXPECT_EQ(Record.CallSites[1][0].hash(), F2.hash());
+ EXPECT_EQ(Record.CallSites[1][1].hash(), F4.hash());
+}
} // namespace
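The new MemProfTest cases above exercise a callback-based denormalization: toMemProfRecord resolves CallStackIds back into frames through a resolver the caller supplies. A sketch of that contract, where CSIdToFrames and FrameIdToFrame are assumed caller-owned lookup tables (any associative container keyed by CallStackId and FrameId works):

    auto Resolver = [&](CallStackId CSId) -> llvm::SmallVector<Frame> {
      llvm::SmallVector<Frame> Frames;
      for (FrameId Id : CSIdToFrames.find(CSId)->second)
        Frames.push_back(FrameIdToFrame.find(Id)->second);
      return Frames;
    };
    // The indexed record stores only hashes; the resolver rebuilds full stacks.
    MemProfRecord Record = IndexedRecord.toMemProfRecord(Resolver);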
diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp
index 67012d2e6dc72..caf7bf0a31717 100644
--- a/llvm/unittests/Support/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp
@@ -769,6 +769,7 @@ R"(All available -march extensions for RISC-V
za128rs 1.0
za64rs 1.0
zacas 1.0
+ zama16b 1.0
zawrs 1.0
zfa 1.0
zfh 1.0
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index a86775a1366b0..d7d0ea2c6a6e7 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -1241,6 +1241,18 @@ TEST(Local, ExpressionForConstant) {
EXPECT_NE(Expr, nullptr);
EXPECT_EQ(Expr->getElement(1), 13841306799765140275U);
+ // Half.
+ Type *HalfTy = Type::getHalfTy(Context);
+ Expr = createExpression(ConstantFP::get(HalfTy, 5.55), HalfTy);
+ EXPECT_NE(Expr, nullptr);
+ EXPECT_EQ(Expr->getElement(1), 17805U);
+
+ // BFloat.
+ Type *BFloatTy = Type::getBFloatTy(Context);
+ Expr = createExpression(ConstantFP::get(BFloatTy, -5.55), BFloatTy);
+ EXPECT_NE(Expr, nullptr);
+ EXPECT_EQ(Expr->getElement(1), 49330U);
+
// Pointer.
PointerType *PtrTy = PointerType::get(Context, 0);
Expr = createExpression(ConstantPointerNull::get(PtrTy), PtrTy);
@@ -1257,15 +1269,6 @@ TEST(Local, ExpressionForConstant) {
EXPECT_NE(Expr, nullptr);
EXPECT_EQ(Expr->getElement(1), 5678U);
- // Others.
- Type *HalfTy = Type::getHalfTy(Context);
- Expr = createExpression(ConstantFP::get(HalfTy, 32), HalfTy);
- EXPECT_EQ(Expr, nullptr);
-
- Type *BFloatTy = Type::getBFloatTy(Context);
- Expr = createExpression(ConstantFP::get(BFloatTy, 32), BFloatTy);
- EXPECT_EQ(Expr, nullptr);
-
Type *FP128Ty = Type::getFP128Ty(Context);
Expr = createExpression(ConstantFP::get(FP128Ty, 32), FP128Ty);
EXPECT_EQ(Expr, nullptr);
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
index 777675b623f32..2b25c62ac2f65 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp
@@ -192,9 +192,9 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
auto Iter = VecBB->begin();
EXPECT_NE(nullptr, dyn_cast(&*Iter++));
EXPECT_NE(nullptr, dyn_cast(&*Iter++));
- EXPECT_NE(nullptr, dyn_cast(&*Iter++));
+ EXPECT_NE(nullptr, dyn_cast(&*Iter++));
EXPECT_NE(nullptr, dyn_cast(&*Iter++));
- EXPECT_NE(nullptr, dyn_cast(&*Iter++));
+ EXPECT_NE(nullptr, dyn_cast(&*Iter++));
EXPECT_NE(nullptr, dyn_cast(&*Iter++));
EXPECT_NE(nullptr, dyn_cast(&*Iter++));
EXPECT_NE(nullptr, dyn_cast(&*Iter++));
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
index cb8737a9e64d2..64e9c06db3fe8 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp
@@ -1029,7 +1029,7 @@ TEST(VPRecipeTest, CastVPBranchOnMaskRecipeToVPUser) {
EXPECT_EQ(&Recipe, BaseR);
}
-TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) {
+TEST(VPRecipeTest, CastVPWidenMemoryRecipeToVPUserAndVPDef) {
LLVMContext C;
IntegerType *Int32 = IntegerType::get(C, 32);
@@ -1038,7 +1038,7 @@
new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
VPValue Addr;
VPValue Mask;
- VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false, {});
+ VPWidenLoadRecipe Recipe(*Load, &Addr, &Mask, true, false, {});
EXPECT_TRUE(isa(&Recipe));
VPRecipeBase *BaseR = &Recipe;
EXPECT_TRUE(isa(BaseR));
@@ -1133,7 +1133,7 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) {
new LoadInst(Int32, UndefValue::get(Int32Ptr), "", false, Align(1));
VPValue Addr;
VPValue Mask;
- VPWidenMemoryInstructionRecipe Recipe(*Load, &Addr, &Mask, true, false, {});
+ VPWidenLoadRecipe Recipe(*Load, &Addr, &Mask, true, false, {});
EXPECT_FALSE(Recipe.mayHaveSideEffects());
EXPECT_TRUE(Recipe.mayReadFromMemory());
EXPECT_FALSE(Recipe.mayWriteToMemory());
@@ -1147,8 +1147,7 @@
VPValue Addr;
VPValue Mask;
VPValue StoredV;
- VPWidenMemoryInstructionRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false,
- false, {});
+ VPWidenStoreRecipe Recipe(*Store, &Addr, &StoredV, &Mask, false, false, {});
EXPECT_TRUE(Recipe.mayHaveSideEffects());
EXPECT_FALSE(Recipe.mayReadFromMemory());
EXPECT_TRUE(Recipe.mayWriteToMemory());
diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp
index 36f8fa1465393..b3a05e081f637 100644
--- a/llvm/utils/TableGen/InstrInfoEmitter.cpp
+++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp
@@ -1181,9 +1181,15 @@ void InstrInfoEmitter::emitRecord(
// Each logical operand can be multiple MI operands.
MinOperands = Inst.Operands.back().MIOperandNo + Inst.Operands.back().MINumOperands;
+ // Even the logical output operand may be multiple MI operands.
+ int DefOperands = 0;
+ if (Inst.Operands.NumDefs) {
+ auto &Opnd = Inst.Operands[Inst.Operands.NumDefs - 1];
+ DefOperands = Opnd.MIOperandNo + Opnd.MINumOperands;
+ }
OS << " { ";
- OS << Num << ",\t" << MinOperands << ",\t" << Inst.Operands.NumDefs << ",\t"
+ OS << Num << ",\t" << MinOperands << ",\t" << DefOperands << ",\t"
<< Inst.TheDef->getValueAsInt("Size") << ",\t" << SchedModels.getSchedClassIdx(Inst) << ",\t";
diff --git a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
index 7a6439cb94910..e57bc6fb507e3 100644
--- a/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
+++ b/llvm/utils/TableGen/RISCVTargetDefEmitter.cpp
@@ -60,11 +60,19 @@ static void EmitRISCVTargetDef(RecordKeeper &RK, raw_ostream &OS) {
if (MArch.empty())
MArch = getMArch(*Rec);
- const bool FastUnalignedAccess =
+ bool FastScalarUnalignedAccess =
any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) {
- return Feature->getValueAsString("Name") == "fast-unaligned-access";
+ return Feature->getValueAsString("Name") == "unaligned-scalar-mem";
});
+ bool FastVectorUnalignedAccess =
+ any_of(Rec->getValueAsListOfDefs("Features"), [&](auto &Feature) {
+ return Feature->getValueAsString("Name") == "unaligned-vector-mem";
+ });
+
+ bool FastUnalignedAccess =
+ FastScalarUnalignedAccess && FastVectorUnalignedAccess;
+
OS << "PROC(" << Rec->getName() << ", " << "{\"" << Rec->getValueAsString("Name") << "\"}, " << "{\"" << MArch << "\"}, " << FastUnalignedAccess << ")\n";
- continue + if check_kind == "SAME": + continue - assert func_name not in result - current_function = result[func_name] = {} + if check_kind == "LABEL": + m = IR_FUNCTION_RE.match(line) + if m is not None: + func_name = m.group(1) + if ( + ti.args.function is not None + and func_name != ti.args.function + ): + # When filtering on a specific function, skip all others. + continue + + current_prefix = prefix + current_function = result[func_name][prefix] = [] + continue + + current_function = None return result @@ -980,10 +991,6 @@ def __init__( def is_local_def_ir_value(self): return self.ir_prefix == "%" - # Return true if this kind of IR value is "global", basically if it matches '#{{.*}}'. - def is_global_scope_ir_value_match(self, match): - return self.global_ir_rhs_regexp is not None - # Return the IR prefix and check prefix we use for this kind or IR value, # e.g., (%, TMP) for locals. If the IR prefix is a regex, return the prefix # used in the IR output @@ -1075,10 +1082,10 @@ def get_value_use(self, var, match, var_prefix=None): NamelessValue(r"TBAA_STRUCT", "!", r"!tbaa.struct ", r"![0-9]+", None), NamelessValue(r"RNG", "!", r"!range ", r"![0-9]+", None), NamelessValue(r"LOOP", "!", r"!llvm.loop ", r"![0-9]+", None), - NamelessValue(r"META", "!", r"metadata ", r"![0-9]+", None), NamelessValue(r"META", "!", r"", r"![0-9]+", r"(?:distinct |)!.*"), NamelessValue(r"ACC_GRP", "!", r"!llvm.access.group ", r"![0-9]+", None), NamelessValue(r"META", "!", r"![a-z.]+ ", r"![0-9]+", None), + NamelessValue(r"META", "!", r"[, (]", r"![0-9]+", None), ] global_nameless_values = [ diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn index 04f20211b3c71..22433459a7878 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn @@ -23,6 +23,7 @@ static_library("FlowSensitive") { target_gen_dir, ] sources = [ + "ASTOps.cpp", "AdornedCFG.cpp", "Arena.cpp", "DataflowAnalysisContext.cpp", diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn index 0519073239430..f7f1fce10bf5f 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/sanitizer_common/BUILD.gn @@ -33,9 +33,6 @@ source_set("sources") { "sanitizer_asm.h", "sanitizer_atomic.h", "sanitizer_atomic_clang.h", - "sanitizer_atomic_clang_mips.h", - "sanitizer_atomic_clang_other.h", - "sanitizer_atomic_clang_x86.h", "sanitizer_atomic_msvc.h", "sanitizer_bitvector.h", "sanitizer_bvgraph.h", diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 4383f1d6d18ff..bd08ce044c247 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -37,9 +37,9 @@ if (current_toolchain == default_toolchain) { "_LIBCPP_INSTRUMENTED_WITH_ASAN=", "_LIBCPP_ABI_DEFINES=", "_LIBCPP_HARDENING_MODE_DEFAULT=_LIBCPP_HARDENING_MODE_NONE", - "_LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH=", - "_LIBCPP_PSTL_CPU_BACKEND_SERIAL=1", - "_LIBCPP_PSTL_CPU_BACKEND_THREAD=", + "_LIBCPP_PSTL_BACKEND_LIBDISPATCH=", + "_LIBCPP_PSTL_BACKEND_SERIAL=1", + "_LIBCPP_PSTL_BACKEND_STD_THREAD=", ] if (libcxx_abi_version != 1) { values += [ "_LIBCPP_ABI_VERSION=$libcxx_abi_version" ] @@ -143,18 +143,12 @@ if (current_toolchain == default_toolchain) { 
"__algorithm/pop_heap.h", "__algorithm/prev_permutation.h", "__algorithm/pstl_any_all_none_of.h", - "__algorithm/pstl_backend.h", - "__algorithm/pstl_backends/cpu_backend.h", "__algorithm/pstl_backends/cpu_backends/any_of.h", - "__algorithm/pstl_backends/cpu_backends/backend.h", "__algorithm/pstl_backends/cpu_backends/fill.h", "__algorithm/pstl_backends/cpu_backends/find_if.h", "__algorithm/pstl_backends/cpu_backends/for_each.h", - "__algorithm/pstl_backends/cpu_backends/libdispatch.h", "__algorithm/pstl_backends/cpu_backends/merge.h", - "__algorithm/pstl_backends/cpu_backends/serial.h", "__algorithm/pstl_backends/cpu_backends/stable_sort.h", - "__algorithm/pstl_backends/cpu_backends/thread.h", "__algorithm/pstl_backends/cpu_backends/transform.h", "__algorithm/pstl_backends/cpu_backends/transform_reduce.h", "__algorithm/pstl_copy.h", @@ -664,6 +658,11 @@ if (current_toolchain == default_toolchain) { "__numeric/transform_exclusive_scan.h", "__numeric/transform_inclusive_scan.h", "__numeric/transform_reduce.h", + "__pstl/backends/libdispatch.h", + "__pstl/backends/serial.h", + "__pstl/backends/std_thread.h", + "__pstl/configuration.h", + "__pstl/configuration_fwd.h", "__pstl/cpu_algos/cpu_traits.h", "__random/bernoulli_distribution.h", "__random/binomial_distribution.h", @@ -982,12 +981,14 @@ if (current_toolchain == default_toolchain) { "ctgmath", "ctime", "ctype.h", + "cuchar", "cwchar", "cwctype", "deque", "errno.h", "exception", "execution", + "expected", "experimental/__config", "experimental/__simd/aligned_tag.h", "experimental/__simd/declaration.h", diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn index 5da8db4574a0c..955854c7a134b 100644 --- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn @@ -49,7 +49,7 @@ config("cxx_config") { "-Wno-covered-switch-default", ] cflags_cc = [ - "-std=c++20", + "-std=c++23", "-nostdinc++", ] defines = [ @@ -125,6 +125,7 @@ cxx_sources = [ "condition_variable_destructor.cpp", "error_category.cpp", "exception.cpp", + "expected.cpp", "fstream.cpp", "functional.cpp", "future.cpp", diff --git a/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn b/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn index c82634e2bb064..7a923c5c854d7 100644 --- a/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxxabi/src/BUILD.gn @@ -66,7 +66,7 @@ config("cxxabi_config") { "//libcxx/src", ] cflags_cc = [ - "-std=c++20", + "-std=c++23", "-nostdinc++", ] defines = [ diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn index 414ea4933c519..c8245739842d9 100644 --- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn @@ -118,6 +118,7 @@ write_lit_cfg("lit_shell_site_cfg") { "LLDB_TOOLS_DIR=" + rebase_path("$root_out_dir/bin"), "LLDB_USE_SYSTEM_DEBUGSERVER=1", # XXX port //lldb/tools/debugserver (?) "LLVM_HOST_TRIPLE=$llvm_current_triple", + "LLVM_USE_SANITIZER=", ] if (llvm_enable_zlib) { diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md index a49ba35db9a68..ba466aa6bc401 100644 --- a/mlir/docs/Dialects/LLVM.md +++ b/mlir/docs/Dialects/LLVM.md @@ -139,12 +139,12 @@ will be reevaluated after considering composite constants. 
### Globals Global variables are also defined using a special operation, -[`llvm.mlir.global`](#llvmmlirglobal-mlirllvmglobalop), located at the module +[`llvm.mlir.global`](#llvmmlirglobal-llvmglobalop), located at the module level. Globals are MLIR symbols and are identified by their name. Since functions need to be isolated-from-above, i.e. values defined outside the function cannot be directly used inside the function, an additional operation, -[`llvm.mlir.addressof`](#llvmmliraddressof-mlirllvmaddressofop), is provided to +[`llvm.mlir.addressof`](#llvmmliraddressof-llvmaddressofop), is provided to locally define a value containing the _address_ of a global. The actual value can then be loaded from that pointer, or a new value can be stored into it if the global is not declared constant. This is similar to LLVM IR where globals diff --git a/mlir/docs/Interfaces.md b/mlir/docs/Interfaces.md index 536e7613e5093..51747db546bb7 100644 --- a/mlir/docs/Interfaces.md +++ b/mlir/docs/Interfaces.md @@ -299,6 +299,30 @@ owner of the dialect containing the object nor the owner of the interface are aware of an interface implementation, which can lead to duplicate or diverging implementations. +Forgetting to register an external model can lead to bugs which are hard to +track down. The `declarePromisedInterface` function can be used to declare that +an external model implementation for an operation must eventually be provided. + +``` + void MyDialect::initialize() { + declarePromisedInterface(); + ... + } +``` + +Now attempting to use the interface, e.g. in a cast, without a prior registration +of the external model will lead to a runtime error that will look similar to +this: + +``` +LLVM ERROR: checking for an interface (`SomeInterface`) that was promised by dialect 'mydialect' but never implemented. This is generally an indication that the dialect extension implementing the interface was never registered. +``` + +If you encounter this error for a dialect and an interface provided by MLIR, you +can look for a method named like +`registerExternalModels(DialectRegistry &registry);`; try +to find it with `git grep 'register.*SomeInterface.*Model' mlir`.
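For illustration, a minimal sketch of what such a registration function typically looks like; `MyDialect`, `SomeOp`, and `SomeOpSomeInterfaceModel` are placeholder names, while `DialectRegistry::addExtension` and `attachInterface` are the real MLIR hooks:

```c++
#include "mlir/IR/DialectRegistry.h"
#include "mlir/IR/MLIRContext.h"

// Hypothetical sketch: fulfill the promise declared in MyDialect::initialize()
// by attaching the external model to the op once the dialect is loaded.
void registerSomeInterfaceExternalModels(mlir::DialectRegistry &registry) {
  registry.addExtension(+[](mlir::MLIRContext *ctx, MyDialect *) {
    // Attaching the model resolves the "promised but never implemented" error.
    SomeOp::attachInterface<SomeOpSomeInterfaceModel>(*ctx);
  });
}
```

Calling such a function on the `DialectRegistry` before the dialect is used is what clears the runtime error shown above.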
+ #### Dialect Fallback for OpInterface Some dialects have an open ecosystem and don't register all of the possible diff --git a/mlir/include/mlir-c/Dialect/NVGPU.h b/mlir/include/mlir-c/Dialect/NVGPU.h index 580d566794c09..e58015a4a3421 100644 --- a/mlir/include/mlir-c/Dialect/NVGPU.h +++ b/mlir/include/mlir-c/Dialect/NVGPU.h @@ -11,6 +11,7 @@ #define MLIR_C_DIALECT_NVGPU_H #include "mlir-c/IR.h" +#include "mlir-c/Support.h" #ifdef __cplusplus extern "C" { @@ -18,6 +19,16 @@ extern "C" { MLIR_DECLARE_CAPI_DIALECT_REGISTRATION(NVGPU, nvgpu); +//===---------------------------------------------------------------------===// +// TensorMapDescriptorType +//===---------------------------------------------------------------------===// + +MLIR_CAPI_EXPORTED bool mlirTypeIsANVGPUTensorMapDescriptorType(MlirType type); + +MLIR_CAPI_EXPORTED MlirType mlirNVGPUTensorMapDescriptorTypeGet( + MlirContext ctx, MlirType tensorMemrefType, int swizzle, int l2promo, + int oobFill, int interleave); + #ifdef __cplusplus } #endif diff --git a/mlir/include/mlir-c/Dialect/Transform/Interpreter.h b/mlir/include/mlir-c/Dialect/Transform/Interpreter.h index 00095d5040a0e..fa320324234e8 100644 --- a/mlir/include/mlir-c/Dialect/Transform/Interpreter.h +++ b/mlir/include/mlir-c/Dialect/Transform/Interpreter.h @@ -60,7 +60,7 @@ MLIR_CAPI_EXPORTED void mlirTransformOptionsDestroy(MlirTransformOptions transformOptions); //----------------------------------------------------------------------------// -// Transform interpreter. +// Transform interpreter and utilities. //----------------------------------------------------------------------------// /// Applies the transformation script starting at the given transform root @@ -72,6 +72,16 @@ MLIR_CAPI_EXPORTED MlirLogicalResult mlirTransformApplyNamedSequence( MlirOperation payload, MlirOperation transformRoot, MlirOperation transformModule, MlirTransformOptions transformOptions); +/// Merge the symbols from `other` into `target`, potentially renaming them to +/// avoid conflicts. Private symbols may be renamed during the merge, public +/// symbols must have at most one declaration. A name conflict in public symbols +/// is reported as an error before returning a failure. +/// +/// Note that this clones the `other` operation unlike the C++ counterpart that +/// takes ownership. +MLIR_CAPI_EXPORTED MlirLogicalResult +mlirMergeSymbolsIntoFromClone(MlirOperation target, MlirOperation other); + #ifdef __cplusplus } #endif diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h index 82da511f807a3..32abacf353133 100644 --- a/mlir/include/mlir-c/IR.h +++ b/mlir/include/mlir-c/IR.h @@ -705,6 +705,13 @@ MLIR_CAPI_EXPORTED void mlirOperationMoveAfter(MlirOperation op, MLIR_CAPI_EXPORTED void mlirOperationMoveBefore(MlirOperation op, MlirOperation other); +/// Operation walk result. +typedef enum MlirWalkResult { + MlirWalkResultAdvance, + MlirWalkResultInterrupt, + MlirWalkResultSkip +} MlirWalkResult; + /// Traversal order for operation walk. typedef enum MlirWalkOrder { MlirWalkPreOrder, @@ -713,7 +720,8 @@ typedef enum MlirWalkOrder { /// Operation walker type. The handler is passed an (opaque) reference to an /// operation and a pointer to a `userData`. -typedef void (*MlirOperationWalkCallback)(MlirOperation, void *userData); +typedef MlirWalkResult (*MlirOperationWalkCallback)(MlirOperation, + void *userData); /// Walks operation `op` in `walkOrder` and calls `callback` on that operation. 
/// `*userData` is passed to the callback as well and can be used to tunnel some diff --git a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h index 52f6321251919..d8f22c7aa1709 100644 --- a/mlir/include/mlir/Bindings/Python/PybindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/PybindAdaptors.h @@ -18,6 +18,7 @@ #ifndef MLIR_BINDINGS_PYTHON_PYBINDADAPTORS_H #define MLIR_BINDINGS_PYTHON_PYBINDADAPTORS_H +#include #include #include #include diff --git a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h index 8e840e744064d..1ea7375220815 100644 --- a/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Affine/Transforms/Transforms.h @@ -53,6 +53,17 @@ void reorderOperandsByHoistability(RewriterBase &rewriter, AffineApplyOp op); /// maximally compose chains of AffineApplyOps. FailureOr decompose(RewriterBase &rewriter, AffineApplyOp op); +/// Reify a bound for the given variable in terms of SSA values for which +/// `stopCondition` is met. +/// +/// By default, lower/equal bounds are closed and upper bounds are open. If +/// `closedUB` is set to "true", upper bounds are also closed. +FailureOr +reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, + bool closedUB = false); + /// Reify a bound for the given index-typed value in terms of SSA values for /// which `stopCondition` is met. If no stop condition is specified, reify in /// terms of the operands of the owner op. diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h index 970a52a06a11a..bbc7e5d3e0dd7 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Transforms.h @@ -24,6 +24,17 @@ enum class BoundType; namespace arith { +/// Reify a bound for the given variable in terms of SSA values for which +/// `stopCondition` is met. +/// +/// By default, lower/equal bounds are closed and upper bounds are open. If +/// `closedUB` is set to "true", upper bounds are also closed. +FailureOr +reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, + bool closedUB = false); + /// Reify a bound for the given index-typed value in terms of SSA values for /// which `stopCondition` is met. If no stop condition is specified, reify in /// terms of the operands of the owner op. diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h index 304a9740d91ed..3c5fa23bd4a7f 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPClauseOperands.h @@ -251,11 +251,10 @@ using SectionsClauseOps = detail::Clauses; // TODO `linear` clause. 
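As an aside on the `mlir-c/IR.h` hunk above: because `MlirOperationWalkCallback` now returns `MlirWalkResult`, existing callbacks must be updated to return an explicit verdict. A minimal sketch under the new contract (the callback and counter are hypothetical; `mlirOperationWalk` is the existing C API entry point):

```c++
#include "mlir-c/IR.h"

// Count all operations reachable from a root op. Returning
// MlirWalkResultAdvance keeps walking; MlirWalkResultSkip would skip the
// current op's regions; MlirWalkResultInterrupt would stop the walk.
static MlirWalkResult countOp(MlirOperation op, void *userData) {
  int *count = static_cast<int *>(userData);
  ++(*count);
  return MlirWalkResultAdvance;
}

void countAllOps(MlirOperation root) {
  int count = 0;
  mlirOperationWalk(root, countOp, &count, MlirWalkPreOrder);
}
```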
-using SimdLoopClauseOps = - detail::Clauses; +using SimdClauseOps = + detail::Clauses; using SingleClauseOps = detail::Clauses; @@ -284,11 +283,10 @@ using TaskgroupClauseOps = detail::Clauses; using TaskloopClauseOps = - detail::Clauses; + detail::Clauses; using TaskwaitClauseOps = detail::Clauses; diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 3abdbe3adfd0b..10771f6e854dd 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -562,7 +562,7 @@ def LoopNestOp : OpenMP_Op<"loop_nest", [SameVariadicOperandSize, loop operations intended to serve as a stopgap solution until the long-term representation of canonical loops is defined. Specifically, this operation is intended to serve as a unique source for loop information during the - transition to making `omp.distribute`, `omp.simdloop`, `omp.taskloop` and + transition to making `omp.distribute`, `omp.simd`, `omp.taskloop` and `omp.wsloop` wrapper operations. It is not intended to help with the addition of support for loop transformations, non-rectangular loops and non-perfectly nested loops. @@ -722,24 +722,19 @@ def WsloopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, // Simd construct [2.9.3.1] //===----------------------------------------------------------------------===// -def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, - AllTypesMatch<["lowerBound", "upperBound", "step"]>, - DeclareOpInterfaceMethods, - RecursiveMemoryEffects]> { - let summary = "simd loop construct"; +def SimdOp : OpenMP_Op<"simd", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods, + RecursiveMemoryEffects, + SingleBlockImplicitTerminator<"TerminatorOp">]> { + let summary = "simd construct"; let description = [{ The simd construct can be applied to a loop to indicate that the loop can be transformed into a SIMD loop (that is, multiple iterations of the loop can - be executed concurrently using SIMD instructions).. The lower and upper - bounds specify a half-open range: the range includes the lower bound but - does not include the upper bound. If the `inclusive` attribute is specified - then the upper bound is also included. - - The body region can contain any number of blocks. The region is terminated - by "omp.yield" instruction without operands. + be executed concurrently using SIMD instructions). - Collapsed loops are represented by the simd-loop having a list of indices, - bounds and steps where the size of the list is equal to the collapse value. + The body region can contain a single block which must contain a single + operation and a terminator. The operation must be another compatible loop + wrapper or an `omp.loop_nest`. The `alignment_values` attribute additionally specifies alignment of each corresponding aligned operand. Note that `$aligned_vars` and @@ -763,32 +758,32 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, SIMD chunk can have a distance in the logical iteration space that is greater than or equal to the value given in the clause. 
``` - omp.simdloop - for (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { - // block operations - omp.yield + omp.simd { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref + %b = load %arrB[%i1, %i2] : memref + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref + omp.yield + } } ``` }]; // TODO: Add other clauses - let arguments = (ins Variadic:$lowerBound, - Variadic:$upperBound, - Variadic:$step, - Variadic:$aligned_vars, + let arguments = (ins Variadic:$aligned_vars, OptionalAttr:$alignment_values, Optional:$if_expr, Variadic:$nontemporal_vars, OptionalAttr:$order_val, ConfinedAttr, [IntPositive]>:$simdlen, - ConfinedAttr, [IntPositive]>:$safelen, - UnitAttr:$inclusive + ConfinedAttr, [IntPositive]>:$safelen ); let regions = (region AnyRegion:$region); let builders = [ - OpBuilder<(ins CArg<"const SimdLoopClauseOps &">:$clauses)> + OpBuilder<(ins CArg<"const SimdClauseOps &">:$clauses)> ]; let assemblyFormat = [{ @@ -800,14 +795,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, |`order` `(` custom($order_val) `)` |`simdlen` `(` $simdlen `)` |`safelen` `(` $safelen `)` - ) `for` custom($region, $lowerBound, $upperBound, $step, - type($step), $inclusive) attr-dict - }]; - - let extraClassDeclaration = [{ - /// Returns the number of loops in the simd loop nest. - unsigned getNumLoops() { return getLowerBound().size(); } - + ) $region attr-dict }]; let hasCustomAssemblyFormat = 1; @@ -818,7 +806,7 @@ def SimdLoopOp : OpenMP_Op<"simdloop", [AttrSizedOperandSegments, def YieldOp : OpenMP_Op<"yield", [Pure, ReturnLike, Terminator, ParentOneOf<["LoopNestOp", "WsloopOp", "DeclareReductionOp", - "AtomicUpdateOp", "SimdLoopOp", "PrivateClauseOp"]>]> { + "AtomicUpdateOp", "PrivateClauseOp"]>]> { let summary = "loop yield and termination operation"; let description = [{ "omp.yield" yields SSA values from the OpenMP dialect op region and @@ -840,7 +828,8 @@ def YieldOp : OpenMP_Op<"yield", //===----------------------------------------------------------------------===// def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments, DeclareOpInterfaceMethods, - RecursiveMemoryEffects]> { + RecursiveMemoryEffects, + SingleBlockImplicitTerminator<"TerminatorOp">]> { let summary = "distribute construct"; let description = [{ The distribute construct specifies that the iterations of one or more loops @@ -855,15 +844,28 @@ def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments, The distribute loop construct specifies that the iterations of the loop(s) will be executed in parallel by threads in the current context. These iterations are spread across threads that already exist in the enclosing - region. The lower and upper bounds specify a half-open range: the - range includes the lower bound but does not include the upper bound. If the - `inclusive` attribute is specified then the upper bound is also included. + region. + + The body region can contain a single block which must contain a single + operation and a terminator. The operation must be another compatible loop + wrapper or an `omp.loop_nest`. The `dist_schedule_static` attribute specifies the schedule for this loop, determining how the loop is distributed across the parallel threads. The optional `schedule_chunk` associated with this determines further controls this distribution. 
+ ```mlir + omp.distribute { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref + %b = load %arrB[%i1, %i2] : memref + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref + omp.yield + } + } + ``` // TODO: private_var, firstprivate_var, lastprivate_var, collapse }]; let arguments = (ins @@ -1016,10 +1018,10 @@ def TaskOp : OpenMP_Op<"task", [AttrSizedOperandSegments, } def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, - AutomaticAllocationScope, RecursiveMemoryEffects, - AllTypesMatch<["lowerBound", "upperBound", "step"]>, + AutomaticAllocationScope, DeclareOpInterfaceMethods, - ReductionClauseInterface]> { + RecursiveMemoryEffects, ReductionClauseInterface, + SingleBlockImplicitTerminator<"TerminatorOp">]> { let summary = "taskloop construct"; let description = [{ The taskloop construct specifies that the iterations of one or more @@ -1027,21 +1029,19 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, iterations are distributed across tasks generated by the construct and scheduled to be executed. - The `lowerBound` and `upperBound` specify a half-open range: the range - includes the lower bound but does not include the upper bound. If the - `inclusive` attribute is specified then the upper bound is also included. - The `step` specifies the loop step. - - The body region can contain any number of blocks. + The body region can contain a single block which must contain a single + operation and a terminator. The operation must be another compatible loop + wrapper or an `omp.loop_nest`. ``` - omp.taskloop - for (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { - %a = load %arrA[%i1, %i2] : memref - %b = load %arrB[%i1, %i2] : memref - %sum = arith.addf %a, %b : f32 - store %sum, %arrC[%i1, %i2] : memref - omp.terminator + omp.taskloop { + omp.loop_nest (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) { + %a = load %arrA[%i1, %i2] : memref + %b = load %arrB[%i1, %i2] : memref + %sum = arith.addf %a, %b : f32 + store %sum, %arrC[%i1, %i2] : memref + omp.yield + } } ``` @@ -1118,11 +1118,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, created. 
}]; - let arguments = (ins Variadic:$lowerBound, - Variadic:$upperBound, - Variadic:$step, - UnitAttr:$inclusive, - Optional:$if_expr, + let arguments = (ins Optional:$if_expr, Optional:$final_expr, UnitAttr:$untied, UnitAttr:$mergeable, @@ -1165,8 +1161,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", [AttrSizedOperandSegments, |`grain_size` `(` $grain_size `:` type($grain_size) `)` |`num_tasks` `(` $num_tasks `:` type($num_tasks) `)` |`nogroup` $nogroup - ) `for` custom($region, $lowerBound, $upperBound, $step, - type($step), $inclusive) attr-dict + ) $region attr-dict }]; let extraClassDeclaration = [{ diff --git a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt index d8039deb5ee21..79e739953d7cf 100644 --- a/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Polynomial/IR/CMakeLists.txt @@ -1,8 +1,8 @@ add_mlir_dialect(Polynomial polynomial) -add_mlir_doc(PolynomialDialect PolynomialDialect Polynomial/ -gen-dialect-doc) -add_mlir_doc(PolynomialOps PolynomialOps Polynomial/ -gen-op-doc) -add_mlir_doc(PolynomialAttributes PolynomialAttributes Dialects/ -gen-attrdef-doc) -add_mlir_doc(PolynomialTypes PolynomialTypes Dialects/ -gen-typedef-doc) +add_mlir_doc(Polynomial PolynomialDialect Polynomial/ -gen-dialect-doc -dialect=polynomial) +add_mlir_doc(Polynomial PolynomialOps Polynomial/ -gen-op-doc) +add_mlir_doc(Polynomial PolynomialAttributes Dialects/ -gen-attrdef-doc) +add_mlir_doc(Polynomial PolynomialTypes Dialects/ -gen-typedef-doc) set(LLVM_TARGET_DEFINITIONS Polynomial.td) mlir_tablegen(PolynomialAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=polynomial) diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td index 0cfc64f9988a0..4e4441c640ed9 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td @@ -27,9 +27,7 @@ class SparseTensor_Op traits = []> // Sparse Tensor Operations. //===----------------------------------------------------------------------===// -def SparseTensor_NewOp : SparseTensor_Op<"new", [Pure]>, - Arguments<(ins AnyType:$source)>, - Results<(outs AnySparseTensor:$result)> { +def SparseTensor_NewOp : SparseTensor_Op<"new", [Pure]> { string summary = "Materializes a new sparse tensor from given source"; string description = [{ Materializes a sparse tensor with contents taken from an opaque pointer @@ -51,15 +49,14 @@ def SparseTensor_NewOp : SparseTensor_Op<"new", [Pure]>, sparse_tensor.new %source : !Source to tensor<1024x1024xf64, #CSR> ``` }]; + + let arguments = (ins AnyType:$source); + let results = (outs AnySparseTensor:$result); let assemblyFormat = "$source attr-dict `:` type($source) `to` type($result)"; } -def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]>, - Arguments<(ins Variadic>:$levels, - TensorOf<[AnyType]>:$values)>, - Results<(outs AnySparseTensor: $result)> { +def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]> { let summary = "Returns a sparse tensor assembled from the given levels and values"; - let description = [{ Assembles the per-level position and coordinate arrays together with the values arrays into a sparse tensor. 
The order and types of the @@ -93,6 +90,9 @@ def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]>, ``` }]; + let arguments = (ins Variadic>:$levels, + TensorOf<[AnyType]>:$values); + let results = (outs AnySparseTensor: $result); let assemblyFormat = "` ` `(` $levels `)` `,` $values attr-dict `:`" " `(` type($levels) `)` `,` type($values) `to` type($result)"; @@ -100,16 +100,8 @@ def SparseTensor_AssembleOp : SparseTensor_Op<"assemble", [Pure]>, let hasVerifier = 1; } -def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVariadicResultSize]>, - Arguments<(ins AnySparseTensor:$tensor, - Variadic>:$out_levels, - TensorOf<[AnyType]>:$out_values)>, - Results<(outs Variadic>:$ret_levels, - TensorOf<[AnyType]>:$ret_values, - Variadic:$lvl_lens, - AnyIndexingScalarLike:$val_len)> { +def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVariadicResultSize]> { let summary = "Copies the levels and values of the given sparse tensor"; - let description = [{ The disassemble operation is the inverse of `sparse_tensor::assemble`. It copies the per-level position and coordinate arrays together with @@ -143,6 +135,13 @@ def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVaria ``` }]; + let arguments = (ins AnySparseTensor:$tensor, + Variadic>:$out_levels, + TensorOf<[AnyType]>:$out_values); + let results = (outs Variadic>:$ret_levels, + TensorOf<[AnyType]>:$ret_values, + Variadic:$lvl_lens, + AnyIndexingScalarLike:$val_len); let assemblyFormat = "$tensor attr-dict `:` type($tensor)" "`out_lvls` `(` $out_levels `:` type($out_levels) `)` " @@ -154,9 +153,7 @@ def SparseTensor_DisassembleOp : SparseTensor_Op<"disassemble", [Pure, SameVaria } def SparseTensor_ConvertOp : SparseTensor_Op<"convert", - [Pure, StageWithSortSparseOpInterface]>, - Arguments<(ins AnyTensor:$source)>, - Results<(outs AnyTensor:$dest)> { + [Pure, StageWithSortSparseOpInterface]> { string summary = "Converts between different tensor types"; string description = [{ Converts one sparse or dense tensor type to another tensor type. The rank @@ -197,20 +194,22 @@ def SparseTensor_ConvertOp : SparseTensor_Op<"convert", }]; + let arguments = (ins AnyTensor:$source); + let results = (outs AnyTensor:$dest); + let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; + let extraClassDeclaration = [{ // Whether the convert can be done by a single step or it would require // an extra sort. Inherited from StageWithSortSparseOpInterface. 
bool needsExtraSort(); }]; - let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; let hasFolder = 1; let hasVerifier = 1; } -def SparseTensor_ReinterpretMapOp : SparseTensor_Op<"reinterpret_map", [NoMemoryEffect]>, - Arguments<(ins AnySparseTensor:$source)>, - Results<(outs AnySparseTensor:$dest)> { +def SparseTensor_ReinterpretMapOp : SparseTensor_Op<"reinterpret_map", + [NoMemoryEffect]> { let summary = "Reinterprets the dimension/level maps of the source tensor"; let description = [{ Reinterprets the dimension-to-level and level-to-dimension map specified in @@ -248,19 +247,20 @@ def SparseTensor_ReinterpretMapOp : SparseTensor_Op<"reinterpret_map", [NoMemory ``` }]; + let arguments = (ins AnySparseTensor:$source); + let results = (outs AnySparseTensor:$dest); + let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; + let builders = [ OpBuilder<(ins "SparseTensorEncodingAttr":$dstEnc, "Value":$source)> ]; - let assemblyFormat = "$source attr-dict `:` type($source) `to` type($dest)"; let hasFolder = 1; let hasVerifier = 1; } def SparseTensor_ToPositionsOp : SparseTensor_Op<"positions", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor, LevelAttr:$level)>, - Results<(outs AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts the `level`-th positions array of the `tensor`"; let description = [{ Returns the positions array of the tensor's storage at the given @@ -280,14 +280,16 @@ def SparseTensor_ToPositionsOp : SparseTensor_Op<"positions", : tensor<64x64xf64, #CSR> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, LevelAttr:$level); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } def SparseTensor_ToCoordinatesOp : SparseTensor_Op<"coordinates", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor, LevelAttr:$level)>, - Results<(outs AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts the `level`-th coordinates array of the `tensor`"; let description = [{ Returns the coordinates array of the tensor's storage at the given @@ -307,14 +309,16 @@ def SparseTensor_ToCoordinatesOp : SparseTensor_Op<"coordinates", : tensor<64x64xf64, #CSR> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, LevelAttr:$level); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } def SparseTensor_ToCoordinatesBufferOp : SparseTensor_Op<"coordinates_buffer", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts the linear coordinates array from a tensor"; let description = [{ Returns the linear coordinates array for a sparse tensor with @@ -339,14 +343,16 @@ def SparseTensor_ToCoordinatesBufferOp : SparseTensor_Op<"coordinates_buffer", : tensor<64x64xf64, #COO> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } def SparseTensor_ToValuesOp : SparseTensor_Op<"values", - [Pure, DeclareOpInterfaceMethods]>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs 
AnyNon0RankedMemRef:$result)> { + [Pure, DeclareOpInterfaceMethods]> { let summary = "Extracts numerical values array from a tensor"; let description = [{ Returns the values array of the sparse storage format for the given @@ -365,13 +371,15 @@ def SparseTensor_ToValuesOp : SparseTensor_Op<"values", %1 = sparse_tensor.values %0 : tensor<64x64xf64, #CSR> to memref ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs AnyNon0RankedMemRef:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($result)"; + let hasVerifier = 1; } -def SparseTensor_NumberOfEntriesOp : SparseTensor_Op<"number_of_entries", [Pure]>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs Index:$result)> { +def SparseTensor_NumberOfEntriesOp : SparseTensor_Op<"number_of_entries", [Pure]> { let summary = "Returns the number of entries that are stored in the tensor."; let description = [{ Returns the number of entries that are stored in the given sparse tensor. @@ -385,14 +393,14 @@ def SparseTensor_NumberOfEntriesOp : SparseTensor_Op<"number_of_entries", [Pure] %noe = sparse_tensor.number_of_entries %tensor : tensor<64x64xf64, #CSR> ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs Index:$result); let assemblyFormat = "$tensor attr-dict `:` type($tensor)"; } def SparseTensor_ConcatenateOp : SparseTensor_Op<"concatenate", - [Pure, StageWithSortSparseOpInterface]>, - Arguments<(ins Variadic:$inputs, DimensionAttr:$dimension)>, - Results<(outs AnyRankedTensor:$result)> { - + [Pure, StageWithSortSparseOpInterface]> { let summary = "Concatenates a list of tensors into a single tensor."; let description = [{ Concatenates a list input tensors and the output tensor with the same @@ -418,13 +426,14 @@ def SparseTensor_ConcatenateOp : SparseTensor_Op<"concatenate", bool needsExtraSort(); }]; + let arguments = (ins Variadic:$inputs, DimensionAttr:$dimension); + let results = (outs AnyRankedTensor:$result); let assemblyFormat = "$inputs attr-dict `:` type($inputs) `to` type($result)"; + let hasVerifier = 1; } -def SparseTensor_ToSliceOffsetOp : SparseTensor_Op<"slice.offset", [Pure]>, - Arguments<(ins AnySparseTensorSlice:$slice, IndexAttr:$dim)>, - Results<(outs Index:$offset)> { +def SparseTensor_ToSliceOffsetOp : SparseTensor_Op<"slice.offset", [Pure]> { let summary = "Extracts the offset of the sparse tensor slice at the given dimension"; let description = [{ Extracts the offset of the sparse tensor slice at the given dimension. @@ -445,13 +454,15 @@ def SparseTensor_ToSliceOffsetOp : SparseTensor_Op<"slice.offset", [Pure]>, // %2 = %v2 ``` }]; + + let arguments = (ins AnySparseTensorSlice:$slice, IndexAttr:$dim); + let results = (outs Index:$offset); let assemblyFormat = "$slice `at` $dim attr-dict `:` type($slice)"; + let hasVerifier = 1; } -def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]>, - Arguments<(ins AnySparseTensorSlice:$slice, IndexAttr:$dim)>, - Results<(outs Index:$stride)> { +def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]> { let summary = "Extracts the stride of the sparse tensor slice at the given dimension"; let description = [{ Extracts the stride of the sparse tensor slice at the given dimension. 
@@ -473,7 +484,11 @@ def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]>, ``` }]; + + let arguments = (ins AnySparseTensorSlice:$slice, IndexAttr:$dim); + let results = (outs Index:$stride); let assemblyFormat = "$slice `at` $dim attr-dict `:` type($slice)"; + let hasVerifier = 1; } @@ -482,9 +497,7 @@ def SparseTensor_ToSliceStrideOp : SparseTensor_Op<"slice.stride", [Pure]>, //===----------------------------------------------------------------------===// def SparseTensor_StorageSpecifierInitOp : SparseTensor_Op<"storage_specifier.init", - [Pure]>, - Arguments<(ins Optional:$source)>, - Results<(outs SparseTensorStorageSpecifier:$result)> { + [Pure]> { let summary = ""; let description = [{ Returns an initial storage specifier value. A storage specifier @@ -515,6 +528,10 @@ def SparseTensor_StorageSpecifierInitOp : SparseTensor_Op<"storage_specifier.ini ``` }]; + let arguments = (ins Optional:$source); + let results = (outs SparseTensorStorageSpecifier:$result); + let assemblyFormat = "attr-dict (`with` $source^)? `:` (`from` qualified(type($source))^ `to`)?" + " qualified(type($result))"; let builders = [ OpBuilder<(ins "Type":$result), [{ @@ -522,15 +539,10 @@ def SparseTensor_StorageSpecifierInitOp : SparseTensor_Op<"storage_specifier.ini }]> ]; - let assemblyFormat = "attr-dict (`with` $source^)? `:` (`from` qualified(type($source))^ `to`)?" - " qualified(type($result))"; + } -def SparseTensor_GetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.get", [Pure]>, - Arguments<(ins SparseTensorStorageSpecifier:$specifier, - SparseTensorStorageSpecifierKindAttr:$specifierKind, - OptionalAttr:$level)>, - Results<(outs Index:$result)> { +def SparseTensor_GetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.get", [Pure]> { let summary = ""; let description = [{ Returns the requested field of the given storage_specifier. @@ -543,19 +555,19 @@ def SparseTensor_GetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.get" ``` }]; + let arguments = (ins SparseTensorStorageSpecifier:$specifier, + SparseTensorStorageSpecifierKindAttr:$specifierKind, + OptionalAttr:$level); + let results = (outs Index:$result); let assemblyFormat = "$specifier $specifierKind (`at` $level^)? attr-dict" "`:` qualified(type($specifier))"; + let hasVerifier = 1; let hasFolder = 1; } def SparseTensor_SetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.set", - [Pure, AllTypesMatch<["result", "specifier"]>]>, - Arguments<(ins SparseTensorStorageSpecifier:$specifier, - SparseTensorStorageSpecifierKindAttr:$specifierKind, - OptionalAttr:$level, - Index:$value)>, - Results<(outs SparseTensorStorageSpecifier:$result)> { + [Pure, AllTypesMatch<["result", "specifier"]>]> { let summary = ""; let description = [{ Set the field of the storage specifier to the given input value. Returns @@ -568,8 +580,15 @@ def SparseTensor_SetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.set" : !sparse_tensor.storage_specifier<#COO> ``` }]; + + let arguments = (ins SparseTensorStorageSpecifier:$specifier, + SparseTensorStorageSpecifierKindAttr:$specifierKind, + OptionalAttr:$level, + Index:$value); + let results = (outs SparseTensorStorageSpecifier:$result); let assemblyFormat = "$specifier $specifierKind (`at` $level^)? `with` $value" " attr-dict `:` qualified(type($result))"; + let hasVerifier = 1; } @@ -577,9 +596,7 @@ def SparseTensor_SetStorageSpecifierOp : SparseTensor_Op<"storage_specifier.set" // Sparse Tensor Coordinate Operations. 
//===----------------------------------------------------------------------===// -def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMemoryEffect]>, - Arguments<(ins AnySparseTensor:$source, Index:$index)>, - Results<(outs Index:$result)> { +def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMemoryEffect]> { let summary = "level index operation"; let description = [{ The `sparse_tensor.lvl` behaves similar to `tensor.dim` operation. @@ -615,9 +632,9 @@ def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMe ``` }]; - let assemblyFormat = [{ - attr-dict $source `,` $index `:` type($source) - }]; + let arguments = (ins AnySparseTensor:$source, Index:$index); + let results = (outs Index:$result); + let assemblyFormat = "attr-dict $source `,` $index `:` type($source) "; let builders = [ OpBuilder<(ins "Value":$source, "int64_t":$index)> @@ -635,11 +652,7 @@ def SparseTensor_LvlOp : SparseTensor_Op<"lvl", [ConditionallySpeculatable, NoMe let hasFolder = 1; } -def SparseTensor_CrdTranslateOp : SparseTensor_Op<"crd_translate", [Pure]>, - Arguments<(ins Variadic:$in_crds, - SparseTensorCrdTransDirectionAttr:$direction, - SparseTensorEncodingAttr:$encoder)>, - Results<(outs Variadic:$out_crds)> { +def SparseTensor_CrdTranslateOp : SparseTensor_Op<"crd_translate", [Pure]> { string summary = "Performs coordinate translation between level and dimension coordinate space."; string description = [{ Performs coordinate translation between level and dimension coordinate space according @@ -652,7 +665,13 @@ def SparseTensor_CrdTranslateOp : SparseTensor_Op<"crd_translate", [Pure]>, : index, index, index, index ``` }]; + + let arguments = (ins Variadic:$in_crds, + SparseTensorCrdTransDirectionAttr:$direction, + SparseTensorEncodingAttr:$encoder); + let results = (outs Variadic:$out_crds); let assemblyFormat = "$direction `[` $in_crds `]` `as` $encoder attr-dict `:` type($out_crds)"; + let hasVerifier = 1; let hasFolder = 1; } @@ -669,13 +688,7 @@ def SparseTensor_PushBackOp : SparseTensor_Op<"push_back", [TypesMatchWith<"value type matches element type of inBuffer", "inBuffer", "value", "::llvm::cast($_self).getElementType()">, - AllTypesMatch<["inBuffer", "outBuffer"]>]>, - Arguments<(ins Index:$curSize, - StridedMemRefRankOf<[AnyType], [1]>:$inBuffer, - AnyType:$value, Optional:$n, - UnitAttr:$inbounds)>, - Results<(outs StridedMemRefRankOf<[AnyType], [1]>:$outBuffer, - Index:$newSize)> { + AllTypesMatch<["inBuffer", "outBuffer"]>]> { string summary = "Pushes a value to the back of a given buffer"; string description = [{ Pushes `value` to the end of the given sparse tensor storage buffer @@ -719,6 +732,13 @@ def SparseTensor_PushBackOp : SparseTensor_Op<"push_back", : xindex, memref, f64 ``` }]; + + let arguments = (ins Index:$curSize, + StridedMemRefRankOf<[AnyType], [1]>:$inBuffer, + AnyType:$value, Optional:$n, + UnitAttr:$inbounds); + let results = (outs StridedMemRefRankOf<[AnyType], [1]>:$outBuffer, + Index:$newSize); let assemblyFormat = "(`inbounds` $inbounds^)? $curSize `,` $inBuffer" " `,` $value (`,` $n^ )? 
attr-dict `:`" " type($curSize) `,` type($inBuffer) `,`" @@ -732,12 +752,7 @@ def SparseTensor_PushBackOp : SparseTensor_Op<"push_back", let hasVerifier = 1; } -def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []>, - Arguments<(ins AnySparseTensor:$tensor)>, - Results<(outs AnyStridedMemRefOfRank<1>:$values, - StridedMemRefRankOf<[I1],[1]>:$filled, - StridedMemRefRankOf<[Index],[1]>:$added, - Index:$count)> { +def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []> { string summary = "Expands an access pattern for insertion"; string description = [{ Performs an access pattern expansion for the innermost levels of the @@ -771,19 +786,19 @@ def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []>, : tensor<4x4xf64, #CSR> to memref, memref, memref ``` }]; + + + let arguments = (ins AnySparseTensor:$tensor); + let results = (outs AnyStridedMemRefOfRank<1>:$values, + StridedMemRefRankOf<[I1],[1]>:$filled, + StridedMemRefRankOf<[Index],[1]>:$added, + Index:$count); let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($values)" " `,` type($filled) `,` type($added)"; } def SparseTensor_CompressOp : SparseTensor_Op<"compress", - [AllTypesMatch<["tensor", "result"]>]>, - Arguments<(ins AnyStridedMemRefOfRank<1>:$values, - StridedMemRefRankOf<[I1],[1]>:$filled, - StridedMemRefRankOf<[Index],[1]>:$added, - Index:$count, - AnySparseTensor:$tensor, - Variadic:$lvlCoords)>, - Results<(outs AnySparseTensor:$result)> { + [AllTypesMatch<["tensor", "result"]>]> { string summary = "Compressed an access pattern for insertion"; string description = [{ Finishes a single access pattern expansion by moving inserted elements @@ -807,6 +822,14 @@ def SparseTensor_CompressOp : SparseTensor_Op<"compress", : memref, memref, memref, tensor<4x4xf64, #CSR> ``` }]; + + let arguments = (ins AnyStridedMemRefOfRank<1>:$values, + StridedMemRefRankOf<[I1],[1]>:$filled, + StridedMemRefRankOf<[Index],[1]>:$added, + Index:$count, + AnySparseTensor:$tensor, + Variadic:$lvlCoords); + let results = (outs AnySparseTensor:$result); let assemblyFormat = "$values `,` $filled `,` $added `,` $count" " `into` $tensor `[` $lvlCoords `]` attr-dict" " `:` type($values) `,` type($filled) `,` type($added)" @@ -814,9 +837,7 @@ def SparseTensor_CompressOp : SparseTensor_Op<"compress", let hasVerifier = 1; } -def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]>, - Arguments<(ins AnySparseTensor:$tensor, UnitAttr:$hasInserts)>, - Results<(outs AnyTensor:$result)> { +def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]> { let summary = "Rematerializes tensor from underlying sparse storage format"; let description = [{ @@ -845,11 +866,13 @@ def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]>, %1 = sparse_tensor.load %0 hasInserts : tensor<16x32xf32, #CSR> ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, UnitAttr:$hasInserts); + let results = (outs AnyTensor:$result); let assemblyFormat = "$tensor (`hasInserts` $hasInserts^)? 
attr-dict `:` type($tensor)"; } -def SparseTensor_OutOp : SparseTensor_Op<"out", []>, - Arguments<(ins AnySparseTensor:$tensor, AnyType:$dest)> { +def SparseTensor_OutOp : SparseTensor_Op<"out", []> { string summary = "Outputs a sparse tensor to the given destination"; string description = [{ Outputs the contents of a sparse tensor to the destination defined by an @@ -868,6 +891,8 @@ def SparseTensor_OutOp : SparseTensor_Op<"out", []>, sparse_tensor.out %t, %dest : tensor<1024x1024xf64, #CSR>, !Dest ``` }]; + + let arguments = (ins AnySparseTensor:$tensor, AnyType:$dest); let assemblyFormat = "$tensor `,` $dest attr-dict `:` type($tensor) `,` type($dest)"; } @@ -875,11 +900,7 @@ def SparseTensor_OutOp : SparseTensor_Op<"out", []>, // Sparse Tensor Sorting/Ordering Operations. //===----------------------------------------------------------------------===// -def SparseTensor_SortOp : SparseTensor_Op<"sort">, - Arguments<(ins Index:$n, StridedMemRefRankOf<[AnyInteger, Index], [1]>:$xy, - Variadic>:$ys, - AffineMapAttr:$perm_map, OptionalAttr:$ny, - SparseTensorSortKindAttr:$algorithm)> { +def SparseTensor_SortOp : SparseTensor_Op<"sort"> { let summary = "Sorts the arrays in xs and ys lexicographically on the " "integral values found in the xs list"; let description = [{ @@ -904,16 +925,18 @@ def SparseTensor_SortOp : SparseTensor_Op<"sort">, ``` }]; + let arguments = (ins Index:$n, + StridedMemRefRankOf<[AnyInteger, Index], [1]>:$xy, + Variadic>:$ys, + AffineMapAttr:$perm_map, OptionalAttr:$ny, + SparseTensorSortKindAttr:$algorithm); let assemblyFormat = "$algorithm $n" "`,`$xy (`jointly` $ys^)? attr-dict" "`:` type($xy) (`jointly` type($ys)^)?"; let hasVerifier = 1; } -def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]>, - Arguments<(ins AnySparseTensor: $input_coo, - SparseTensorSortKindAttr:$algorithm)>, - Results<(outs AnySparseTensor: $result_coo)> { +def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]> { let summary = "Reorder the input COO such that it has the the same order as " "the output COO"; let description = [{ @@ -933,6 +956,9 @@ def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]>, ``` }]; + let arguments = (ins AnySparseTensor: $input_coo, + SparseTensorSortKindAttr:$algorithm); + let results = (outs AnySparseTensor: $result_coo); let assemblyFormat = "$algorithm $input_coo attr-dict" "`:` type($input_coo) `to` type($result_coo)"; @@ -944,9 +970,7 @@ def SparseTensor_ReorderCOOOp : SparseTensor_Op<"reorder_coo", [Pure]>, // Sparse Tensor Syntax Operations. 
//===----------------------------------------------------------------------===// -def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [Pure]>, - Arguments<(ins AnyType:$x, AnyType:$y, UnitAttr:$left_identity, UnitAttr:$right_identity)>, - Results<(outs AnyType:$output)> { +def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [Pure]> { let summary = "Binary set operation utilized within linalg.generic"; let description = [{ Defines a computation within a `linalg.generic` operation that takes two @@ -1054,18 +1078,24 @@ def SparseTensor_BinaryOp : SparseTensor_Op<"binary", [Pure]>, }]; let regions = (region AnyRegion:$overlapRegion, AnyRegion:$leftRegion, AnyRegion:$rightRegion); + let arguments = (ins AnyType:$x, AnyType:$y, UnitAttr:$left_identity, UnitAttr:$right_identity); + let results = (outs AnyType:$output); let assemblyFormat = [{ $x `,` $y `:` attr-dict type($x) `,` type($y) `to` type($output) `\n` `overlap` `=` $overlapRegion `\n` `left` `=` (`identity` $left_identity^):($leftRegion)? `\n` `right` `=` (`identity` $right_identity^):($rightRegion)? }]; + let hasVerifier = 1; } -def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [Pure]>, - Arguments<(ins AnyType:$x)>, - Results<(outs AnyType:$output)> { +def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [Pure]> { + + let arguments = (ins AnyType:$x); + + let results = (outs AnyType:$output); + let summary = "Unary set operation utilized within linalg.generic"; let description = [{ Defines a computation with a `linalg.generic` operation that takes a single @@ -1162,9 +1192,7 @@ def SparseTensor_UnaryOp : SparseTensor_Op<"unary", [Pure]>, let hasVerifier = 1; } -def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResultType]>, - Arguments<(ins AnyType:$x, AnyType:$y, AnyType:$identity)>, - Results<(outs AnyType:$output)> { +def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResultType]> { let summary = "Custom reduction operation utilized within linalg.generic"; let description = [{ Defines a computation with a `linalg.generic` operation that takes two @@ -1208,16 +1236,14 @@ def SparseTensor_ReduceOp : SparseTensor_Op<"reduce", [Pure, SameOperandsAndResu }]; let regions = (region SizedRegion<1>:$region); + let arguments = (ins AnyType:$x, AnyType:$y, AnyType:$identity); + let results = (outs AnyType:$output); + let assemblyFormat = "$x `,` $y `,` $identity attr-dict `:` type($output) $region"; - let assemblyFormat = [{ - $x `,` $y `,` $identity attr-dict `:` type($output) $region - }]; let hasVerifier = 1; } -def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResultType]>, - Arguments<(ins AnyType:$x)>, - Results<(outs AnyType:$output)> { +def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResultType]> { let summary = "Select operation utilized within linalg.generic"; let description = [{ Defines an evaluation within a `linalg.generic` operation that takes a single @@ -1269,16 +1295,16 @@ def SparseTensor_SelectOp : SparseTensor_Op<"select", [Pure, SameOperandsAndResu }]; let regions = (region SizedRegion<1>:$region); - let assemblyFormat = [{ - $x attr-dict `:` type($x) $region - }]; + let arguments = (ins AnyType:$x); + let results = (outs AnyType:$output); + let assemblyFormat = "$x attr-dict `:` type($x) $region"; + let hasVerifier = 1; } def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator, ParentOneOf<["BinaryOp", "UnaryOp", "ReduceOp", "SelectOp", - "ForeachOp"]>]>, - Arguments<(ins Variadic:$results)> { + 
"ForeachOp"]>]> { let summary = "Yield from sparse_tensor set-like operations"; let description = [{ Yields a value from within a `binary`, `unary`, `reduce`, @@ -1319,17 +1345,12 @@ def SparseTensor_YieldOp : SparseTensor_Op<"yield", [Pure, Terminator, } }]; - let assemblyFormat = [{ - $results attr-dict `:` type($results) - }]; + let arguments = (ins Variadic:$results); + let assemblyFormat = "$results attr-dict `:` type($results)"; } def SparseTensor_ForeachOp : SparseTensor_Op<"foreach", - [SingleBlockImplicitTerminator<"YieldOp">]>, - Arguments<(ins AnyTensor:$tensor, - Variadic:$initArgs, - OptionalAttr:$order)>, - Results<(outs Variadic:$results)> { + [SingleBlockImplicitTerminator<"YieldOp">]> { let summary = "Iterates over elements in a tensor"; let description = [{ Iterates over stored elements in a tensor (which are typically, but not always, @@ -1424,18 +1445,79 @@ def SparseTensor_ForeachOp : SparseTensor_Op<"foreach", ]; let regions = (region SizedRegion<1>:$region); + let arguments = (ins AnyTensor:$tensor, + Variadic:$initArgs, + OptionalAttr:$order); + let results = (outs Variadic:$results); let assemblyFormat = "`in` $tensor (`init``(`$initArgs^`)`)? attr-dict" " `:` type($tensor) (`,` type($initArgs)^)?" " (`->` type($results)^)? `do` $region"; let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// Sparse Tensor Iteration Operations. +//===----------------------------------------------------------------------===// + +def ExtractIterSpaceOp : SparseTensor_Op<"extract_iteration_space", + [Pure, DeclareOpInterfaceMethods]> { + let summary = "Extracts an iteration space from a sparse tensor between certain levels"; + let description = [{ + Extracts a `!sparse_tensor.iter_space` from a sparse tensor between + certain (consecutive) levels. For sparse levels, it is usually done by + loading a postion range from the underlying sparse tensor storage. + E.g., for a compressed level, the iteration space is extracted by + [pos[i], pos[i+1]) supposing the the parent iterator points at `i`. + + `tensor`: the input sparse tensor that defines the iteration space. + `parentIter`: the iterator for the previous level, at which the iteration space + at the current levels will be extracted. + `loLvl`, `hiLvl`: the level range between [loLvl, hiLvl) in the input tensor that + the returned iteration space covers. `hiLvl - loLvl` defines the dimension of the + iteration space. + + The type of returned the value is automatically inferred to + `!sparse_tensor.iter_space<#INPUT_ENCODING, lvls = $loLvl to $hiLvl>`. + The returned iteration space can then be iterated over by + `sparse_tensor.iterate` operations to visit every stored element + (usually nonzeros) in the input sparse tensor. + + Example: + ```mlir + // Extracts a 1-D iteration space from a COO tensor at level 1. + %space = sparse_tensor.iteration.extract_space %sp at %it1 lvls = 1 + : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + ``` + }]; + + + let extraClassDeclaration = [{ + std::pair getLvlRange() { + return std::make_pair(getLoLvl(), getHiLvl()); + } + unsigned getSpaceDim() { + return getHiLvl() - getLoLvl(); + } + ArrayRef<::mlir::sparse_tensor::LevelType> getSpaceLvlTypes() { + return getResultSpace().getType().getLvlTypes(); + } + }]; + + let arguments = (ins AnySparseTensor:$tensor, + Optional:$parentIter, + LevelAttr:$loLvl, LevelAttr:$hiLvl); + let results = (outs AnySparseIterSpace:$resultSpace); + let assemblyFormat = "$tensor (`at` $parentIter^)? 
`lvls` `=` custom($loLvl, $hiLvl) " + " attr-dict `:` type($tensor) (`,` type($parentIter)^)?"; + + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // Sparse Tensor Debugging and Test-Only Operations. //===----------------------------------------------------------------------===// -def SparseTensor_PrintOp : SparseTensor_Op<"print">, - Arguments<(ins AnySparseTensor:$tensor)> { +def SparseTensor_PrintOp : SparseTensor_Op<"print"> { string summary = "Prints a sparse tensor (for testing and debugging)"; string description = [{ Prints the individual components of a sparse tensors (the positions, @@ -1449,6 +1531,8 @@ def SparseTensor_PrintOp : SparseTensor_Op<"print">, sparse_tensor.print %tensor : tensor<1024x1024xf64, #CSR> ``` }]; + + let arguments = (ins AnySparseTensor:$tensor); let assemblyFormat = "$tensor attr-dict `:` type($tensor)"; } diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td index 185cff46ae25d..79113d8778743 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorTypes.td @@ -72,4 +72,101 @@ def SparseTensorStorageSpecifier : Type($_self)">, "metadata", "::mlir::sparse_tensor::StorageSpecifierType">; +//===----------------------------------------------------------------------===// +// Sparse Tensor Iteration Types. +//===----------------------------------------------------------------------===// + +def SparseTensor_IterSpace : SparseTensor_Type<"IterSpace"> { + let mnemonic = "iter_space"; + + let description = [{ + A sparse iteration space that represents an abstract N-D (sparse) iteration space + extracted from a sparse tensor, i.e., a set of (crd_0, crd_1, ..., crd_N) for + every stored element (usually nonzeros) in a sparse tensor between the specified + [$loLvl, $hiLvl) levels. + + Examples: + + ```mlir + // An iteration space extracted from a CSR tensor between levels [0, 2). + !iter_space<#CSR, lvls = 0 to 2> + ``` + }]; + + let parameters = (ins + SparseTensorEncodingAttr : $encoding, + "Level" : $loLvl, + "Level" : $hiLvl + ); + + let extraClassDeclaration = [{ + /// The the dimension of the iteration space. + unsigned getSpaceDim() const { + return getHiLvl() - getLoLvl(); + } + + /// Get the level types for the iteration space. + ArrayRef getLvlTypes() const { + return getEncoding().getLvlTypes().slice(getLoLvl(), getSpaceDim()); + } + + /// Whether the iteration space is unique (i.e., no duplicated coordinate). + bool isUnique() { + return !getLvlTypes().back().isa(); + } + + /// Get the corresponding iterator type. + ::mlir::sparse_tensor::IteratorType getIteratorType() const; + }]; + + let assemblyFormat="`<` $encoding `,` `lvls` `=` custom($loLvl, $hiLvl) `>`"; +} + +def SparseTensor_Iterator : SparseTensor_Type<"Iterator"> { + let mnemonic = "iterator"; + + let description = [{ + An iterator that points to the current element in the corresponding iteration space. + + Examples: + + ```mlir + // An iterator that iterates over a iteration space of type `!iter_space<#CSR, lvls = 0 to 2>` + !iterator<#CSR, lvls = 0 to 2> + ``` + }]; + + let parameters = (ins + SparseTensorEncodingAttr : $encoding, + "Level" : $loLvl, + "Level" : $hiLvl + ); + + let extraClassDeclaration = [{ + /// Get the corresponding iteration space type. 
+ ::mlir::sparse_tensor::IterSpaceType getIterSpaceType() const; + + unsigned getSpaceDim() const { return getIterSpaceType().getSpaceDim(); } + ArrayRef getLvlTypes() const { return getIterSpaceType().getLvlTypes(); } + bool isUnique() { return getIterSpaceType().isUnique(); } + }]; + + let assemblyFormat="`<` $encoding `,` `lvls` `=` custom($loLvl, $hiLvl) `>`"; +} + +def IsSparseSparseIterSpaceTypePred + : CPred<"::llvm::isa<::mlir::sparse_tensor::IterSpaceType>($_self)">; + +def IsSparseSparseIteratorTypePred + : CPred<"::llvm::isa<::mlir::sparse_tensor::IteratorType>($_self)">; + +def AnySparseIterSpace + : Type; + +def AnySparseIterator + : Type; + + #endif // SPARSETENSOR_TYPES diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 4706d5ba2f218..2f844cee5ff52 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -460,6 +460,7 @@ def SparsificationAndBufferization : Pass<"sparsification-and-bufferization", "M "memref::MemRefDialect", "scf::SCFDialect", "sparse_tensor::SparseTensorDialect", + "vector::VectorDialect" ]; } diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td index cff3de0a69af9..3687891fe4b7c 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td @@ -130,11 +130,11 @@ def Tosa_ScalarTensor : TensorRankOf<[Tosa_AnyNumber], [0]>; // to not include any remaining unranked tensors. def Tosa_UnrankedTensor : UnrankedTensorOf<[Tosa_AnyNumber]>; -def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>]>; -def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>]>; +def Tosa_Tensor1D : AnyTypeOf<[Tosa_UnrankedTensor, 1DTensorOf<[Tosa_AnyNumber]>], "1-d tensor", "::mlir::TensorType">; +def Tosa_Tensor2D : AnyTypeOf<[Tosa_UnrankedTensor, 2DTensorOf<[Tosa_AnyNumber]>], "2-d tensor", "::mlir::TensorType">; +def Tosa_Tensor3D : AnyTypeOf<[Tosa_UnrankedTensor, 3DTensorOf<[Tosa_AnyNumber]>], "3-d tensor", "::mlir::TensorType">; +def Tosa_Tensor4D : AnyTypeOf<[Tosa_UnrankedTensor, 4DTensorOf<[Tosa_AnyNumber]>], "4-d tensor", "::mlir::TensorType">; +def Tosa_Tensor5D : AnyTypeOf<[Tosa_UnrankedTensor, TensorRankOf<[Tosa_AnyNumber], [5]>], "5-d tensor", "::mlir::TensorType">; // Ranked tensors up to given rank. 
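Stepping back to the sparse-iteration hunks above: the following is a hypothetical usage sketch, not part of the patch, showing how `extract_iteration_space` and the new `iter_space`/`iterator` types are meant to compose. The `#CSR` encoding, SSA names, and shapes are assumptions; the result types follow the inference rule quoted in the op description.

```mlir
// Hedged sketch: extract the level-0 iteration space of a CSR tensor,
// then, given an iterator %it0 assumed to point into that space, extract
// the level-1 space underneath it.
%s0 = sparse_tensor.extract_iteration_space %csr lvls = 0
    : tensor<4x8xf32, #CSR>
// inferred: !sparse_tensor.iter_space<#CSR, lvls = 0>
%s1 = sparse_tensor.extract_iteration_space %csr at %it0 lvls = 1
    : tensor<4x8xf32, #CSR>, !sparse_tensor.iterator<#CSR, lvls = 0>
// inferred: !sparse_tensor.iter_space<#CSR, lvls = 1>
```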
def Tosa_Tensor1Dto4D : AnyTypeOf<[
diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
index 21c9595860d4c..fbac1ffb621fd 100644
--- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
+++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td
@@ -331,7 +331,10 @@ def ApplyPatternsOp : TransformDialectOp<"apply_patterns",
   }];
 
   let arguments = (ins
-    TransformHandleTypeInterface:$target, UnitAttr:$apply_cse);
+    TransformHandleTypeInterface:$target,
+    UnitAttr:$apply_cse,
+    DefaultValuedAttr<I64Attr, "static_cast<int64_t>(-1)">:$max_iterations,
+    DefaultValuedAttr<I64Attr, "static_cast<int64_t>(-1)">:$max_num_rewrites);
   let results = (outs);
   let regions = (region MaxSizedRegion<1>:$patterns);
 
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 147bc2354977d..332b5ad08ced9 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -420,7 +420,7 @@ def Vector_ShuffleOp :
      PredOpTrait<"second operand v2 and result have same element type",
                  TCresVTEtIsSameAsOpBase<0, 1>>,
      InferTypeOpAdaptor]>,
-    Arguments<(ins AnyVectorOfAnyRank:$v1, AnyVectorOfAnyRank:$v2,
+    Arguments<(ins AnyFixedVector:$v1, AnyFixedVector:$v2,
                I64ArrayAttr:$mask)>,
     Results<(outs AnyVector:$vector)> {
   let summary = "shuffle operation";
@@ -444,6 +444,8 @@ def Vector_ShuffleOp :
   mask values must be within range, viz. given two k-D operands v1 and v2
   above, all mask values are in the range [0,s_1+t_1)
 
+  Note: scalable vectors are not supported.
+
   Example:
 
   ```mlir
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
index 87aabdc015fea..eca9255ff3974 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -12,6 +12,7 @@
 #include "mlir/Bytecode/BytecodeOpInterface.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
+#include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/ShapedOpInterfaces.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index cd38549f1ccf4..6579d07ec2621 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -19,17 +19,36 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
 }
 
 def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
+  let summary = [{a composite attribute for `TensorDescType`}];
+  let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite
+    attribute defined for `TensorDescType` for describing the following
+    properties of a `TensorDesc`.
+    1. `memory_scope`: It describes where the data block described by the
+        TensorDesc is located: `Global` device memory or `Shared` local memory.
+        It defaults to `Global`.
+    2. `array_length`: It describes how many horizontally consecutive blocks
+        will be loaded by a hardware load instruction. If the TensorDesc shape
+        is 8x16 with array_length = 2, the loaded block shape will actually be
+        8x32. Its default value is 1.
+    3. `boundary_check`: It indicates whether the hardware should perform an
+        out-of-boundary check. The default value is true.
+    4. `scattered`: It is used to differentiate TensorDescs created from
+       `create_nd_tdesc` vs from `create_tdesc`.
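The new `scattered` property on `tdesc_attr` is easiest to see in the printed type. A hedged illustration follows, assuming the default struct-like printed form of the attribute; the shapes and the `slm` scope are made-up values:

```mlir
// Blocked (default) descriptor: every tdesc_attr property keeps its default.
!xegpu.tensor_desc<8x16xf16>
// Scattered descriptor in shared local memory; array_length and
// boundary_check keep their defaults (1 and true).
!xegpu.tensor_desc<16x8xf16, #xegpu.tdesc_attr<memory_scope = slm, scattered = true>>
```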
+  }];
+
   let parameters = (ins
     OptionalParameter<"MemoryScopeAttr">: $memory_scope,
     OptionalParameter<"IntegerAttr", "1">: $array_length,
-    OptionalParameter<"BoolAttr", "true">: $boundary_check
+    OptionalParameter<"BoolAttr", "true">: $boundary_check,
+    OptionalParameter<"BoolAttr", "false">: $scattered
   );
 
   let builders = [
     AttrBuilder<(ins
       CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
       CArg<"int", "1">:$array_length,
-      CArg<"bool", "true">: $boundary_check
+      CArg<"bool", "true">: $boundary_check,
+      CArg<"bool", "false">: $scattered
     )>
   ];
 
@@ -41,15 +60,17 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
 //===----------------------------------------------------------------------===//
 def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
 def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
-def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", 
-    "The address space of the memory the tensor descritor is created for", 
+def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
+    "The address space of the memory the tensor descriptor is created for",
     [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }
 
-def XeGPU_MemoryScopeAttr: 
+def XeGPU_MemoryScopeAttr:
   EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
+  let summary = [{Describe the location of data described by a `TensorDesc`:
+                  Global device memory (`Global`) or Shared local memory (`SLM`).}];
   let assemblyFormat = "$value";
 }
 
@@ -63,19 +84,18 @@ def XeGPU_CachePolicyInvalid: I32EnumAttrCase<"READ_INVALIDATE", 3, "read_invalidate">;  // valid for read only
 def XeGPU_CachePolicyWriteBack: I32EnumAttrCase<"WRITE_BACK", 4, "write_back">; // valid for write only
 def XeGPU_CachePolicyWriteThrough: I32EnumAttrCase<"WRITE_THROUGH", 5, "write_through">; // valid for write only
 
-def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy", 
-    [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached, 
+def XeGPU_CachePolicyEnums : I32EnumAttr<"CachePolicy", "Cache policy",
+    [XeGPU_CachePolicyCached, XeGPU_CachePolicyUncached,
     XeGPU_CachePolicyStreaming, XeGPU_CachePolicyInvalid,
     XeGPU_CachePolicyWriteBack, XeGPU_CachePolicyWriteThrough]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
 }
 
-def XeGPU_CacheHintAttr 
+def XeGPU_CacheHintAttr
   : EnumAttr<XeGPU_Dialect, XeGPU_CachePolicyEnums, "cache_hint"> {
+  let summary = [{Describe the cache settings for prefetch/load/store operators}];
   let assemblyFormat = "`<` $value `>`";
 }
-
-
-#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
\ No newline at end of file
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index b8ebd1a40c607..c6f7f83441b96 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -47,36 +47,35 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
 }
 
-def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface, 
+def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface,
                         AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> {
 
   let summary = "Create nd-tensor descriptor operation";
 
   let description = [{
     The "create_nd_tdesc" operation creates a TensorDescType which represents
     a sub-view of a 2D memory region (It can be extended to support n-D memory
-    region if needed in future). Elements in the subview continuous in each 
-    dimention. It encodes the following important information for supporting
+    region if needed in future). Elements in the subview are contiguous in each
+    dimension. It encodes the following important information for supporting
     Intel hardware features:
 
-    * source: an object representing (starting address/pointer of) a 2D memory region. 
+    * source: an object representing (starting address/pointer of) a 2D memory region.
         It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
-        for the later case, the shape and layout information of the 2D memory region should 
-        be explicitly passed via `dynamic_shape` and `dynamic_strides` parameters.
-    * offsets: two index values represents offsets from the "source" at the each dimension 
+        For the latter case, the shape and layout information of the 2D memory region should
+        be explicitly passed via `shape` and `strides` parameters.
+    * offsets: two index values represent offsets from the "source" at each dimension
        at which the subview of the target memory will be created. It is encoded via two
-        variables, including "dynamic_offsets" and "static_offsets", such that it can
-        accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4])).
-    * shape: the shape information of the memory region pointed by the "source". It is 
-        typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>. 
-        But if "source" is simply a pointer represented as uint64_t type, or a memref 
-        type without shape information e.g., memref<?x?xf16>, the shape information has 
-        to be explicitly passed via the "dynamic_shape" argument. Currently "dynamic_shape" 
-        only accepts operands(e.g., [%c4096, %c4096]), not attributes(e.g., [4096, 4096]).
-    * strides: the strides of the memory region pointed by the "source". Similar to shape, 
-        it is typically encoded via the MemRefType of the source too. But if "source" is 
-        simply a pointer represented as uint64_t type, or a memref type without shape 
-        information e.g., memref<?x?xf16>, the strides information has to be explicitly 
-        passed via the "dynamic_strides" argument. And it currently only accepts operands two.
+        variables, including "offsets" and "const_offsets", such that it can
+        accept various forms, such as operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+    * shape: the shape information of the memory region pointed by the "source". It is
+        typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+        But if "source" is simply a pointer represented as uint64_t type, or a memref
+        type without shape information e.g., memref<?x?xf16>, the shape information has
+        to be explicitly passed via the "shape" and "const_shape" arguments.
+    * strides: the strides of the memory region pointed by the "source". Similar to shape,
+        it is typically encoded via the MemRefType of the source too. But if "source" is
+        simply a pointer represented as uint64_t type, or a memref type without shape
+        information e.g., memref<?x?xf16>, the strides information has to be explicitly
+        passed via the "strides" and "const_strides" argument.
 
     Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
     %0 = memref.alloc() : memref<1024x1024xf32>
@@ -97,10 +96,10 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     %1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
   }];
 
-  let arguments = (ins 
-    XeGPU_BaseAddrType: $source, 
-    Variadic<Index>: $offsets, 
-    Variadic<Index>: $shape, 
+  let arguments = (ins
+    XeGPU_BaseAddrType: $source,
+    Variadic<Index>: $offsets,
+    Variadic<Index>: $shape,
     Variadic<Index>: $strides,
     DenseI64ArrayAttr: $const_offsets,
     OptionalAttr<DenseI64ArrayAttr>: $const_shape,
@@ -119,12 +118,12 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
   let hasVerifier = 1;
 
   let builders = [
-    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source, 
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<MemRefType>": $source,
                    "llvm::ArrayRef<OpFoldResult>": $offsets)>,
 
-    OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType> ": $source, 
+    OpBuilder<(ins "Type": $tdesc, "TypedValue<IntegerType>": $source,
                    "llvm::ArrayRef<OpFoldResult>": $offsets,
-                   "llvm::ArrayRef<OpFoldResult>": $shape, 
+                   "llvm::ArrayRef<OpFoldResult>": $shape,
                    "llvm::ArrayRef<OpFoldResult>": $strides)>
   ];
 
@@ -159,41 +158,41 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
     }
 
     /// wrapper for matching with OffsetSizeAndStrideOpInterface
-    /// If source is IntegerType or `const_shape` is filled, 
+    /// If source is IntegerType or `const_shape` is filled,
     /// it will return `const_shape`, such that mixes of `shape`
-    /// and `const_shape` will be used to represent the shape of 
+    /// and `const_shape` will be used to represent the shape of
     /// source operand. They override static shape from source memref type.
     ArrayRef<int64_t> getStaticSizes() {
       auto attr = getConstShapeAttr();
       if (getSourceType().isa<IntegerType>() || attr)
        return attr;
-      
+
      auto memrefType = getSourceType().dyn_cast<MemRefType>();
      assert(memrefType && "Incorrect use of getStaticSizes");
      return memrefType.getShape();
    }
 
    /// wrapper for matching with OffsetSizeAndStrideOpInterface
-    /// If source is IntegerType or `const_strides` is filled, it 
+    /// If source is IntegerType or `const_strides` is filled, it
    /// will return `const_strides`, such that mixes of `strides`
-    /// and `const_strides` will be used to represent the strides of 
+    /// and `const_strides` will be used to represent the strides of
    /// source operand. They override static strides from source memref type.
    ArrayRef<int64_t> getStaticStrides() {
      auto attr = getConstStridesAttr();
      if (getSourceType().isa<IntegerType>() || attr)
        return attr;
-      
+
      auto memrefType = getSourceType().dyn_cast<MemRefType>();
      assert(memrefType && "Incorrect use of getStaticStrides");
      auto [strides, offset] = getStridesAndOffset(memrefType);
-      // reuse the storage of ConstStridesAttr since strides from 
+      // reuse the storage of ConstStridesAttr since strides from
      // memref is not persistent
      setConstStrides(strides);
      attr = getConstStridesAttr();
      return attr;
    }
 
-    /// Return the expected rank of each of the`static_offsets`, 
+    /// Return the expected rank of each of the `static_offsets`,
    /// `static_shape` and `static_strides` attributes.
    std::array<unsigned, 3> getArrayAttrMaxRanks() {
      unsigned rank;
@@ -204,8 +203,8 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
      }
      return {rank, rank, rank};
    }
-    
-    /// Return the number of leading operands before the `offsets`, 
+
+    /// Return the number of leading operands before the `offsets`,
    /// `shape` and `strides` operands.
    static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
@@ -214,15 +213,15 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
 }
 
 def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
-  let summary = "prefetches a nD block to cache";
+  let summary = "prefetches an n-D block to cache";
   let description = [{
-    It issues an instruction to prefetch the data from memory to each 
-    level of the cache based on their cache policy.
+    It issues an instruction to prefetch a block of data from contiguous
+    memory regions to each level of the cache based on their cache policy.
 
    Example:
    ```
-      xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>, 
-                                l2_hint = #xegpu.cache_hint<cached>, 
+      xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+                                l2_hint = #xegpu.cache_hint<cached>,
                                l3_hint = #xegpu.cache_hint<cached>}
        : !xegpu.tensor_desc<8x16xf16>
    ```
 
@@ -233,34 +232,41 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
      OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
      OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
      OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-      
-  let extraClassDeclaration = extraBaseClassDeclaration;
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
 
  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+  let hasVerifier = 1;
 }
 
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
-  let summary = "loads a n-D block from memory (represented by TensorDesc)"
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>,
+                                          AllElementCountsMatch<["value", "TensorDesc"]>]> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc)"
                "to registers (represented by vector)";
  let description = [{
-    LoadNdOp essentially mimics the hardware block read instruction to read 
-    a block of data from memory to register. It takes a set of optional cache 
-    hints for each level of cache, L1, L2 and L3. If hardware does not have a 
+    LoadNdOp essentially mimics the hardware block read instruction to read
+    a block of data from memory to register. It takes a set of optional cache
+    hints for each level of cache, L1, L2 and L3. If hardware does not have a
    corresponding cache, the corresponding cache hint attribute will be masked.
-    vnni transform is an hardware feature for Intel GPU, which is used to 
-    do data packing during the load for B operand of matrix operation, if 
-    the bit width of the data type is less then 32 bits, e.g., fp16. And 
+    vnni transform is a hardware feature for Intel GPU, which is used to
+    do data packing during the load for the B operand of a matrix operation, if
+    the bit width of the data type is less than 32 bits, e.g., fp16. And
    transpose is another Intel hardware feature, which will do transpose
-    operation when loading the data if the bit width of the data type is 
-    fp32 or fp64. It implies that vnni and transpose cannot exit at the 
+    operation when loading the data if the bit width of the data type is
+    fp32 or fp64. It implies that vnni and transpose cannot exist at the
    same time.
 
    Example:
    ```
      xegpu.load_nd %1 {transpose = [1, 0],
-                        l1_hint = #xegpu.cache_hint<cached>, 
-                        l2_hint = #xegpu.cache_hint<uncached>, 
+                        l1_hint = #xegpu.cache_hint<cached>,
+                        l2_hint = #xegpu.cache_hint<uncached>,
                        l3_hint = #xegpu.cache_hint<streaming>}
        : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
    ```
 
@@ -291,20 +297,21 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd"> {
  let hasVerifier = 1;
 }
 
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc"]>,
+                                            AllElementTypesMatch<["value", "TensorDesc"]>]> {
  let summary = "stores an n-D block register region back to memory, currently only supports 2D";
  let description = [{
    StoreNdOp essentially mimics the hardware block write instruction to
-    write a block of data from register into the memory region as described 
-    by the TensorDesc. It takes a set of optional cache hints for each level 
-    of cache, L1, L2 and L3. If hardware does not have a correspoding cache, 
+    write a block of data from register into the memory region as described
+    by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If hardware does not have a corresponding cache,
    the corresponding cache hint attribute will be masked.
 
    Example:
    ```
      xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
-                             l2_hint = #xegpu.cache_hint<write_back>, 
+                             l2_hint = #xegpu.cache_hint<write_back>,
                             l3_hint = #xegpu.cache_hint<write_through>}
        : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
    ```
 
@@ -318,11 +325,342 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", []> {
      OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
      OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
 
-  let extraClassDeclaration = extraBaseClassDeclaration;
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    VectorType getValueType() {
+      return llvm::dyn_cast<VectorType>(getValue().getType());
+    }
 
-  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict 
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
                        `:` type($value) `,` qualified(type($TensorDesc))}];
 
  let hasVerifier = 1;
 }
 
+def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
+                [AllTypesMatch<["TensorDesc", "result"]>]> {
+  let summary = "It updates the offsets for the TensorDesc.";
+  let description = [{The op updates the offset of the given TensorDesc.
+    The offsets are relative to the current position, in number of elements.
+    The result is a TensorDesc of the same type as the input.
+
+    Example:
+    ```
+      %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32>
+    ```
+  }];
+
+  let arguments = (ins
+    XeGPU_TensorDesc: $TensorDesc,
+    Variadic<Index>: $offsets,
+    DenseI64ArrayAttr: $const_offsets);
+
+  let results = (outs XeGPU_TensorDesc: $result);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let assemblyFormat = [{
+    $TensorDesc `,`
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` qualified(type($result))
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
+  let summary = "create scattered tensor descriptors (TensorDesc).";
+  let description = [{
+    "create_tdesc" is similar to "create_nd_tdesc" in that it creates
+    a Tensor Descriptor (TensorDescType) for a memory region. While "create_nd_tdesc"
+    is for creating contiguous subviews, "create_tdesc" is for creating non-contiguous
+    (scattered) subviews, allowing each work-item in a subgroup to specify its own offset.
+    It accepts the following parameters:
+
+    * source: a 1D memref or pointer (uint64_t) representing the flattened memory object.
+    * offsets: an array containing the offsets of each access point. Its size
+      is fixed to the hardware-supported subgroup size, e.g., 16 on PVC,
+      implying each element in the array corresponds to a work-item (SIMT lane)
+      in the subgroup.
+    * chunk_size: [optional attribute] indicates the number of contiguous
+      elements accessed for each offset; the default is 1.
+
+    Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
+    ```
+    %a = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+    ```
+
+    Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements,
+    for a total of 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
+    ```
+    %0 = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+    ```
+
+    Example 3. It is similar to Example 2, but there are overlaps among work-items.
+    It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
+    ```
+    %0 = memref.alloc() : memref<1024xf32>
+    %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
+    ```
+  }];
+
+  let arguments = (ins XeGPU_BaseAddrType: $source,
+                       Variadic<Index>: $offsets,
+                       DenseI64ArrayAttr: $const_offsets,
+                       DefaultValuedAttr<I64Attr, "1">: $chunk_size);
+  let results = (outs XeGPU_TensorDesc:$TensorDesc);
+
+  let builders = [
+    OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
+                   "llvm::ArrayRef<OpFoldResult>": $offsets,
+                   CArg<"uint32_t", "1"> : $chunk_size)>,
+  ];
+
+  let assemblyFormat = [{
+    $source
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+  }];
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    mlir::Value getViewSource() { return getSource(); }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
+  let summary = "prefetches a set of scattered data points to cache";
+
+  let description = [{
+    It issues instructions to prefetch a set of scattered data points
+    from memory to each level of the cache based on their cache policy.
+    As compared to prefetch_nd, which works on non-scattered TensorDesc,
+    it works on scattered TensorDesc instead.
+
+    Example:
+    ```
+      xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
+                             l2_hint = #xegpu.cache_hint<cached>,
+                             l3_hint = #xegpu.cache_hint<cached>}
+        : !xegpu.tensor_desc<16xf16>
+    ```
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+  }];
+
+  let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc))";
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]>,
+                                           AllElementTypesMatch<["value", "TensorDesc"]>,
+                                           AllElementCountsMatch<["value", "TensorDesc"]>]> {
+  let summary = "loads a set of scattered data points from memory.";
+
+  let description = [{ It (aka. load) loads data for each work-item. The output
+    describes the data being loaded at the subgroup level, so its size is
+    consistent with the number of work-items in a subgroup. When the `chunk_size_per_lane`
+    attribute of the TensorDesc is larger than 1, the output vector is a 2-D vector,
+    with dim-1 corresponding to the chunk size.
+
+    The mask operand masks out memory accesses so that it is safe to pass out-of-boundary
+    addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+
+    Example:
+    ```
+      %2 = xegpu.load %1, %0 {transpose = [1, 0],
+                              l1_hint = #xegpu.cache_hint<cached>,
+                              l2_hint = #xegpu.cache_hint<uncached>,
+                              l3_hint = #xegpu.cache_hint<streaming>}
+            : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
+              -> vector<16xf32>
+    ```
+
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       XeGPU_MaskType: $mask,
+                       OptionalAttr<DenseI64ArrayAttr>: $transpose,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+                       OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+  let results = (outs XeGPU_ValueType: $value);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    mlir::Type getElementType() {
+      auto type = getValue().getType();
+      return getElementTypeOrSelf(type);
+    }
+
+    Type getValueType() {
+      return getValue().getType();
+    }
+
+    Type getMaskType() {
+      return getMask().getType();
+    }
+
+  }];
+
+  let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+      `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
+                                              AllElementTypesMatch<["value", "TensorDesc"]>]> {
+  let summary = "stores data to scattered memory locations.";
+  let description = [{ It (aka. store) stores data to scattered memory locations.
+    It has similar semantics to `load_gather`.
+
+    Example:
+    ```
+      xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+                              l2_hint = #xegpu.cache_hint<write_back>,
+                              l3_hint = #xegpu.cache_hint<write_through>}
+            : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
+    ```
+  }];
+
+  let arguments = (ins
+    XeGPU_ValueType: $value,
+    XeGPU_TensorDesc: $TensorDesc,
+    XeGPU_MaskType: $mask,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+    OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    Type getValueType() {
+      return getValue().getType();
+    }
+
+    Type getMaskType() {
+      return getMask().getType();
+    }
+  }];
+
+  let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+            `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+
+  let hasVerifier = 1;
+}
+
+def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
+          [AllTypesMatch<["TensorDesc", "result"]>]> {
+  let summary = "It updates the offsets for the given tensor descriptor";
+
+  let description = [{It behaves similarly to `update_nd_offset` in that
+    it updates the offset of a TensorDesc; the offsets are relative to the
+    current position, in number of elements. However, `update_nd_offset`
+    updates the start point of a 2D block, so its offset contains two
+    elements representing the shift in each dimension. `update_offset`
+    updates the offset per work-item, so its offsets contain values representing
+    shifts for each work-item.
+
+    Example:
+    ```
+      %2 = xegpu.update_offset %1, [32, 32, 32, 32]
+            : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+    ```
+  }];
+
+  let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+                       Variadic<Index>: $offsets,
+                       DenseI64ArrayAttr: $const_offsets);
+  let results = (outs XeGPU_TensorDesc: $result);
+
+  let extraClassDeclaration = extraBaseClassDeclaration # [{
+    xegpu::TensorDescType getTensorDescType() {
+      return getTensorDesc().getType();
+    }
+
+    SmallVector<OpFoldResult> getMixedOffsets() {
+      Builder b(getContext());
+      return getMixedValues(getConstOffsets(), getOffsets(), b);
+    }
+
+    size_t getNumOffsets() {
+      return getMixedOffsets().size();
+    }
+
+    OpFoldResult getOffset(unsigned idx) {
+      assert(idx < getNumOffsets() && "Invalid out of bound access.");
+      return getMixedOffsets()[idx];
+    }
+  }];
+
+  let assemblyFormat = [{
+    $TensorDesc `,`
+    custom<DynamicIndexList>($offsets, $const_offsets)
+    attr-dict `:` qualified(type($TensorDesc))
+  }];
+}
+
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 19ac1693712dd..4cd4e5411653c 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -34,10 +34,10 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
        [ShapedTypeInterface], "::mlir::TensorType"> {
  let summary = "TensorDesc describing regions of interested data.";
  let description = [{
-    TensorDesc is a type designed to describe regions of the interested data as well as some 
-    features that are unique to Intel hardware. Different with the builtin tensor type in MLIR, 
-    it essentially only contains the meta data, and doesn't hold the data by itself. It is designed 
-    to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU. 
+    TensorDesc is a type designed to describe regions of the data of interest as well as some
+    features that are unique to Intel hardware. Different from the builtin tensor type in MLIR,
+    it essentially only contains the metadata, and doesn't hold the data by itself. It is designed
+    to mainly support 2D block load/store and DPAS (matrix multiplication instruction) on Intel GPU.
    It encodes the following information:
 
    * shape:  the sizes/shape of the interested data block, e.g., 8x16 means 8 rows
@@ -46,15 +46,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
      is set or not.
    * element_type: the data type of the data element, e.g., f16, f32.
 
-    Similar to the builtin tensor, it also provides an optinal attribute to encoding 
+    Similar to the builtin tensor, it also provides an optional attribute encoding
    the following information via the TensorDescAttr object:
-    * memory_scope (xegpu::MemoryScope): [optional] where the data is located, 
+    * memory_scope (xegpu::MemoryScope): [optional] where the data is located,
      global memory or shared memory. It defaults to Global.
    * array_length (int): [optional] The number of contiguous blocks with size as `shape`,
      that will be loaded by block load at a time. It defaults to 1.
-    * boundary_check (bool): [optional] indicates whether the operation detects the boundary 
+    * boundary_check (bool): [optional] indicates whether the operation detects the boundary
      and pads with zero for out-of-boundary access. By default, the boundary check is performed.
-    
+
    Syntax:
 
    ```
    TensorDesc-type ::= `tensor_desc` `<` dim-list element-type (`,` attr-list)? `>`
    element-type ::= float-type | integer-type | index-type
    dim-list := (static-dim-list `x`)?
static-dim-list ::= decimal-literal `x` decimal-literal - attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? + attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? ``` Examples: @@ -84,6 +84,17 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", "mlir::Type": $elementType, OptionalParameter<"mlir::Attribute">: $encoding); + let builders = [ + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef": $shape, + "mlir::Type": $elementType, + CArg<"bool", "false">: $scattered, + CArg<"int", "1">: $array_length, + CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"bool", "true">: $boundary_check + )> + ]; + let extraClassDeclaration = [{ using TensorType::clone; using mlir::ShapedType::Trait::getElementTypeBitWidth; @@ -116,7 +127,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", if (attr && attr.getArrayLength()) return attr.getArrayLength().getInt(); // return default value - return 1; + return 1; } bool getBoundaryCheck() { @@ -126,10 +137,18 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", // return default value return true; } + + bool getScattered() { + auto attr = getEncodingAsTensorDescAttr(); + if (attr && attr.getScattered()) + return attr.getScattered().getValue(); + // return default value + return false; + } }]; let hasCustomAssemblyFormat = true; - + } #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD diff --git a/mlir/include/mlir/IR/CommonAttrConstraints.td b/mlir/include/mlir/IR/CommonAttrConstraints.td index 0312ac7ec1d8d..0d69bb0717a59 100644 --- a/mlir/include/mlir/IR/CommonAttrConstraints.td +++ b/mlir/include/mlir/IR/CommonAttrConstraints.td @@ -755,7 +755,7 @@ class AllAttrOf constraints> : AttrConstraint< class IntNEQValue : AttrConstraint< CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() != " # n>, - "whose minimum value is " # n>; + "whose value is not " # n>; class IntMinValue : AttrConstraint< CPred<"::llvm::cast<::mlir::IntegerAttr>($_self).getInt() >= " # n>, diff --git a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h index 1d7bc6ea961cc..ac17ace5a976d 100644 --- a/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h +++ b/mlir/include/mlir/Interfaces/ValueBoundsOpInterface.h @@ -15,6 +15,7 @@ #include "mlir/IR/Value.h" #include "mlir/Interfaces/DestinationStyleOpInterface.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/ExtensibleRTTI.h" #include @@ -111,6 +112,39 @@ class ValueBoundsConstraintSet public: static char ID; + /// A variable that can be added to the constraint set as a "column". The + /// value bounds infrastructure can compute bounds for variables and compare + /// two variables. + /// + /// Internally, a variable is represented as an affine map and operands. + class Variable { + public: + /// Construct a variable for an index-typed attribute or SSA value. + Variable(OpFoldResult ofr); + + /// Construct a variable for an index-typed SSA value. + Variable(Value indexValue); + + /// Construct a variable for a dimension of a shaped value. + Variable(Value shapedValue, int64_t dim); + + /// Construct a variable for an index-typed attribute/SSA value or for a + /// dimension of a shaped value. A non-null dimension must be provided if + /// and only if `ofr` is a shaped value. 
+ Variable(OpFoldResult ofr, std::optional dim); + + /// Construct a variable for a map and its operands. + Variable(AffineMap map, ArrayRef mapOperands); + Variable(AffineMap map, ArrayRef mapOperands); + + MLIRContext *getContext() const { return map.getContext(); } + + private: + friend class ValueBoundsConstraintSet; + AffineMap map; + ValueDimList mapOperands; + }; + /// The stop condition when traversing the backward slice of a shaped value/ /// index-type value. The traversal continues until the stop condition /// evaluates to "true" for a value. @@ -121,35 +155,31 @@ class ValueBoundsConstraintSet using StopConditionFn = std::function /*dim*/, ValueBoundsConstraintSet &cstr)>; - /// Compute a bound for the given index-typed value or shape dimension size. - /// The computed bound is stored in `resultMap`. The operands of the bound are - /// stored in `mapOperands`. An operand is either an index-type SSA value - /// or a shaped value and a dimension. + /// Compute a bound for the given variable. The computed bound is stored in + /// `resultMap`. The operands of the bound are stored in `mapOperands`. An + /// operand is either an index-type SSA value or a shaped value and a + /// dimension. /// - /// `dim` must be `nullopt` if and only if `value` is index-typed. The bound - /// is computed in terms of values/dimensions for which `stopCondition` - /// evaluates to "true". To that end, the backward slice (reverse use-def - /// chain) of the given value is visited in a worklist-driven manner and the - /// constraint set is populated according to `ValueBoundsOpInterface` for each - /// visited value. + /// The bound is computed in terms of values/dimensions for which + /// `stopCondition` evaluates to "true". To that end, the backward slice + /// (reverse use-def chain) of the given value is visited in a worklist-driven + /// manner and the constraint set is populated according to + /// `ValueBoundsOpInterface` for each visited value. /// /// By default, lower/equal bounds are closed and upper bounds are open. If /// `closedUB` is set to "true", upper bounds are also closed. - static LogicalResult computeBound(AffineMap &resultMap, - ValueDimList &mapOperands, - presburger::BoundType type, Value value, - std::optional dim, - StopConditionFn stopCondition, - bool closedUB = false); + static LogicalResult + computeBound(AffineMap &resultMap, ValueDimList &mapOperands, + presburger::BoundType type, const Variable &var, + StopConditionFn stopCondition, bool closedUB = false); /// Compute a bound in terms of the values/dimensions in `dependencies`. The /// computed bound consists of only constant terms and dependent values (or /// dimension sizes thereof). static LogicalResult computeDependentBound(AffineMap &resultMap, ValueDimList &mapOperands, - presburger::BoundType type, Value value, - std::optional dim, ValueDimList dependencies, - bool closedUB = false); + presburger::BoundType type, const Variable &var, + ValueDimList dependencies, bool closedUB = false); /// Compute a bound in that is independent of all values in `independencies`. /// @@ -161,13 +191,10 @@ class ValueBoundsConstraintSet /// appear in the computed bound. 
static LogicalResult computeIndependentBound(AffineMap &resultMap, ValueDimList &mapOperands, - presburger::BoundType type, Value value, - std::optional dim, ValueRange independencies, - bool closedUB = false); + presburger::BoundType type, const Variable &var, + ValueRange independencies, bool closedUB = false); - /// Compute a constant bound for the given affine map, where dims and symbols - /// are bound to the given operands. The affine map must have exactly one - /// result. + /// Compute a constant bound for the given variable. /// /// This function traverses the backward slice of the given operands in a /// worklist-driven manner until `stopCondition` evaluates to "true". The @@ -182,16 +209,9 @@ class ValueBoundsConstraintSet /// By default, lower/equal bounds are closed and upper bounds are open. If /// `closedUB` is set to "true", upper bounds are also closed. static FailureOr - computeConstantBound(presburger::BoundType type, Value value, - std::optional dim = std::nullopt, + computeConstantBound(presburger::BoundType type, const Variable &var, StopConditionFn stopCondition = nullptr, bool closedUB = false); - static FailureOr computeConstantBound( - presburger::BoundType type, AffineMap map, ValueDimList mapOperands, - StopConditionFn stopCondition = nullptr, bool closedUB = false); - static FailureOr computeConstantBound( - presburger::BoundType type, AffineMap map, ArrayRef mapOperands, - StopConditionFn stopCondition = nullptr, bool closedUB = false); /// Compute a constant delta between the given two values. Return "failure" /// if a constant delta could not be determined. @@ -221,9 +241,8 @@ class ValueBoundsConstraintSet /// proven. This could be because the specified relation does in fact not hold /// or because there is not enough information in the constraint set. In other /// words, if we do not know for sure, this function returns "false". - bool populateAndCompare(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); + bool populateAndCompare(const Variable &lhs, ComparisonOperator cmp, + const Variable &rhs); /// Return "true" if "lhs cmp rhs" was proven to hold. Return "false" if the /// specified relation could not be proven. This could be because the @@ -233,24 +252,12 @@ class ValueBoundsConstraintSet /// /// This function keeps traversing the backward slice of lhs/rhs until could /// prove the relation or until it ran out of IR. - static bool compare(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); - static bool compare(AffineMap lhs, ValueDimList lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ValueDimList rhsOperands); - static bool compare(AffineMap lhs, ArrayRef lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ArrayRef rhsOperands); - - /// Compute whether the given values/dimensions are equal. Return "failure" if + static bool compare(const Variable &lhs, ComparisonOperator cmp, + const Variable &rhs); + + /// Compute whether the given variables are equal. Return "failure" if /// equality could not be determined. - /// - /// `dim1`/`dim2` must be `nullopt` if and only if `value1`/`value2` are - /// index-typed. - static FailureOr areEqual(OpFoldResult value1, OpFoldResult value2, - std::optional dim1 = std::nullopt, - std::optional dim2 = std::nullopt); + static FailureOr areEqual(const Variable &var1, const Variable &var2); /// Return "true" if the given slices are guaranteed to be overlapping. 
/// Return "false" if the given slices are guaranteed to be non-overlapping. @@ -317,9 +324,6 @@ class ValueBoundsConstraintSet /// /// This function does not analyze any IR and does not populate any additional /// constraints. - bool compareValueDims(OpFoldResult lhs, std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim); bool comparePos(int64_t lhsPos, ComparisonOperator cmp, int64_t rhsPos); /// Given an affine map with a single result (and map operands), add a new @@ -374,6 +378,7 @@ class ValueBoundsConstraintSet /// constraint system. Return the position of the new column. Any operands /// that were not analyzed yet are put on the worklist. int64_t insert(AffineMap map, ValueDimList operands, bool isSymbol = true); + int64_t insert(const Variable &var, bool isSymbol = true); /// Project out the given column in the constraint set. void projectOut(int64_t pos); @@ -381,6 +386,8 @@ class ValueBoundsConstraintSet /// Project out all columns for which the condition holds. void projectOut(function_ref condition); + void projectOutAnonymous(std::optional except = std::nullopt); + /// Mapping of columns to values/shape dimensions. SmallVector> positionToValueDim; /// Reverse mapping of values/shape dimensions to columns. diff --git a/mlir/include/mlir/Interfaces/ViewLikeInterface.td b/mlir/include/mlir/Interfaces/ViewLikeInterface.td index ea5bb1b5ac485..9397f271e1bc6 100644 --- a/mlir/include/mlir/Interfaces/ViewLikeInterface.td +++ b/mlir/include/mlir/Interfaces/ViewLikeInterface.td @@ -158,7 +158,7 @@ def OffsetSizeAndStrideOpInterface : OpInterface<"OffsetSizeAndStrideOpInterface >, InterfaceMethod< /*desc=*/[{ - Return a vector of all the static or dynamic sizes of the op. + Return a vector of all the static or dynamic offsets of the op. }], /*retTy=*/"::llvm::SmallVector<::mlir::OpFoldResult, 4>", /*methodName=*/"getMixedOffsets", diff --git a/mlir/lib/Bindings/Python/DialectNVGPU.cpp b/mlir/lib/Bindings/Python/DialectNVGPU.cpp new file mode 100644 index 0000000000000..341e4d55bcf21 --- /dev/null +++ b/mlir/lib/Bindings/Python/DialectNVGPU.cpp @@ -0,0 +1,41 @@ +//===--- DialectNvgpu.cpp - Pybind module for Nvgpu dialect API support ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir-c/Dialect/NVGPU.h" +#include "mlir-c/IR.h" +#include "mlir/Bindings/Python/PybindAdaptors.h" +#include + +namespace py = pybind11; +using namespace llvm; +using namespace mlir; +using namespace mlir::python; +using namespace mlir::python::adaptors; + +static void populateDialectNvgpuSubmodule(const pybind11::module &m) { + auto nvgpuTensorMapDescriptorType = mlir_type_subclass( + m, "TensorMapDescriptorType", mlirTypeIsANVGPUTensorMapDescriptorType); + + nvgpuTensorMapDescriptorType.def_classmethod( + "get", + [](py::object cls, MlirType tensorMemrefType, int swizzle, int l2promo, + int oobFill, int interleave, MlirContext ctx) { + return cls(mlirNVGPUTensorMapDescriptorTypeGet( + ctx, tensorMemrefType, swizzle, l2promo, oobFill, interleave)); + }, + "Gets an instance of TensorMapDescriptorType in the same context", + py::arg("cls"), py::arg("tensor_type"), py::arg("swizzle"), + py::arg("l2promo"), py::arg("oob_fill"), py::arg("interleave"), + py::arg("ctx") = py::none()); +} + +PYBIND11_MODULE(_mlirDialectsNvgpu, m) { + m.doc() = "MLIR NVGPU dialect."; + + populateDialectNvgpuSubmodule(m); +} diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 734f2f7f3f94c..d875f4eba2b13 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -674,6 +674,7 @@ void PyMlirContext::clearOperationsInside(PyOperationBase &op) { data->rootOp.getOperation().getContext()->clearOperation(op); else data->rootSeen = true; + return MlirWalkResult::MlirWalkResultAdvance; }; mlirOperationWalk(op.getOperation(), invalidatingCallback, static_cast(&data), MlirWalkPreOrder); @@ -1249,6 +1250,21 @@ void PyOperationBase::writeBytecode(const py::object &fileObject, .str()); } +void PyOperationBase::walk( + std::function callback, + MlirWalkOrder walkOrder) { + PyOperation &operation = getOperation(); + operation.checkValid(); + MlirOperationWalkCallback walkCallback = [](MlirOperation op, + void *userData) { + auto *fn = + static_cast *>(userData); + return (*fn)(op); + }; + + mlirOperationWalk(operation, walkCallback, &callback, walkOrder); +} + py::object PyOperationBase::getAsm(bool binary, std::optional largeElementsLimit, bool enableDebugInfo, bool prettyDebugInfo, @@ -2511,6 +2527,15 @@ void mlir::python::populateIRCore(py::module &m) { .value("NOTE", MlirDiagnosticNote) .value("REMARK", MlirDiagnosticRemark); + py::enum_(m, "WalkOrder", py::module_local()) + .value("PRE_ORDER", MlirWalkPreOrder) + .value("POST_ORDER", MlirWalkPostOrder); + + py::enum_(m, "WalkResult", py::module_local()) + .value("ADVANCE", MlirWalkResultAdvance) + .value("INTERRUPT", MlirWalkResultInterrupt) + .value("SKIP", MlirWalkResultSkip); + //---------------------------------------------------------------------------- // Mapping of Diagnostics. //---------------------------------------------------------------------------- @@ -2989,8 +3014,7 @@ void mlir::python::populateIRCore(py::module &m) { py::arg("binary") = false, kOperationPrintStateDocstring) .def("print", py::overload_cast, bool, bool, bool, bool, - bool, py::object, bool>( - &PyOperationBase::print), + bool, py::object, bool>(&PyOperationBase::print), // Careful: Lots of arguments must match up with print method. 
py::arg("large_elements_limit") = py::none(), py::arg("enable_debug_info") = false, @@ -3038,7 +3062,9 @@ void mlir::python::populateIRCore(py::module &m) { return operation.createOpView(); }, "Detaches the operation from its parent block.") - .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); }); + .def("erase", [](PyOperationBase &self) { self.getOperation().erase(); }) + .def("walk", &PyOperationBase::walk, py::arg("callback"), + py::arg("walk_order") = MlirWalkPostOrder); py::class_(m, "Operation", py::module_local()) .def_static("create", &PyOperation::create, py::arg("name"), diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index 9acfdde25ae04..b038a0c54d29b 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -579,6 +579,10 @@ class PyOperationBase { void writeBytecode(const pybind11::object &fileObject, std::optional bytecodeVersion); + // Implement the walk method. + void walk(std::function callback, + MlirWalkOrder walkOrder); + /// Moves the operation before or after the other operation. void moveAfter(PyOperationBase &other); void moveBefore(PyOperationBase &other); diff --git a/mlir/lib/Bindings/Python/TransformInterpreter.cpp b/mlir/lib/Bindings/Python/TransformInterpreter.cpp index 6517f8c39dfad..f6b4532b1b6be 100644 --- a/mlir/lib/Bindings/Python/TransformInterpreter.cpp +++ b/mlir/lib/Bindings/Python/TransformInterpreter.cpp @@ -82,6 +82,21 @@ static void populateTransformInterpreterSubmodule(py::module &m) { py::arg("payload_root"), py::arg("transform_root"), py::arg("transform_module"), py::arg("transform_options") = PyMlirTransformOptions()); + + m.def( + "copy_symbols_and_merge_into", + [](MlirOperation target, MlirOperation other) { + mlir::python::CollectDiagnosticsToStringScope scope( + mlirOperationGetContext(target)); + + MlirLogicalResult result = mlirMergeSymbolsIntoFromClone(target, other); + if (mlirLogicalResultIsFailure(result)) { + throw py::value_error( + "Failed to merge symbols.\nDiagnostic message " + + scope.takeMessage()); + } + }, + py::arg("target"), py::arg("other")); } PYBIND11_MODULE(_mlirTransformInterpreter, m) { diff --git a/mlir/lib/CAPI/Dialect/NVGPU.cpp b/mlir/lib/CAPI/Dialect/NVGPU.cpp index 02d10954a0377..e6da529e1b6b5 100644 --- a/mlir/lib/CAPI/Dialect/NVGPU.cpp +++ b/mlir/lib/CAPI/Dialect/NVGPU.cpp @@ -9,5 +9,23 @@ #include "mlir-c/Dialect/NVGPU.h" #include "mlir/CAPI/Registration.h" #include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" +#include "mlir/IR/BuiltinTypes.h" + +using namespace mlir; +using namespace mlir::nvgpu; MLIR_DEFINE_CAPI_DIALECT_REGISTRATION(NVGPU, nvgpu, mlir::nvgpu::NVGPUDialect) + +bool mlirTypeIsANVGPUTensorMapDescriptorType(MlirType type) { + return isa(unwrap(type)); +} + +MlirType mlirNVGPUTensorMapDescriptorTypeGet(MlirContext ctx, + MlirType tensorMemrefType, + int swizzle, int l2promo, + int oobFill, int interleave) { + return wrap(nvgpu::TensorMapDescriptorType::get( + unwrap(ctx), cast(unwrap(tensorMemrefType)), + TensorMapSwizzleKind(swizzle), TensorMapL2PromoKind(l2promo), + TensorMapOOBKind(oobFill), TensorMapInterleaveKind(interleave))); +} diff --git a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp index eb6951dc5584d..145455e1c1b3d 100644 --- a/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp +++ b/mlir/lib/CAPI/Dialect/TransformInterpreter.cpp @@ -15,6 +15,7 @@ #include "mlir/CAPI/IR.h" #include "mlir/CAPI/Support.h" #include "mlir/CAPI/Wrap.h" +#include 
"mlir/Dialect/Transform/IR/Utils.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/Dialect/Transform/Transforms/TransformInterpreterUtils.h" @@ -71,4 +72,12 @@ MlirLogicalResult mlirTransformApplyNamedSequence( unwrap(payload), unwrap(transformRoot), cast(unwrap(transformModule)), *unwrap(transformOptions))); } + +MlirLogicalResult mlirMergeSymbolsIntoFromClone(MlirOperation target, + MlirOperation other) { + OwningOpRef otherOwning(unwrap(other)->clone()); + LogicalResult result = transform::detail::mergeSymbolsInto( + unwrap(target), std::move(otherOwning)); + return wrap(result); +} } diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index cdb64f4ec4a40..a72cd247e73f6 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -717,17 +717,34 @@ void mlirOperationMoveBefore(MlirOperation op, MlirOperation other) { return unwrap(op)->moveBefore(unwrap(other)); } +static mlir::WalkResult unwrap(MlirWalkResult result) { + switch (result) { + case MlirWalkResultAdvance: + return mlir::WalkResult::advance(); + + case MlirWalkResultInterrupt: + return mlir::WalkResult::interrupt(); + + case MlirWalkResultSkip: + return mlir::WalkResult::skip(); + } +} + void mlirOperationWalk(MlirOperation op, MlirOperationWalkCallback callback, void *userData, MlirWalkOrder walkOrder) { switch (walkOrder) { case MlirWalkPreOrder: unwrap(op)->walk( - [callback, userData](Operation *op) { callback(wrap(op), userData); }); + [callback, userData](Operation *op) { + return unwrap(callback(wrap(op), userData)); + }); break; case MlirWalkPostOrder: unwrap(op)->walk( - [callback, userData](Operation *op) { callback(wrap(op), userData); }); + [callback, userData](Operation *op) { + return unwrap(callback(wrap(op), userData)); + }); } } diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index 03e578136e590..4a15976d40c76 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -1289,13 +1289,14 @@ struct AngleOpConversion : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto type = op.getType(); + arith::FastMathFlagsAttr fmf = op.getFastMathFlagsAttr(); Value real = rewriter.create(loc, type, adaptor.getComplex()); Value imag = rewriter.create(loc, type, adaptor.getComplex()); - rewriter.replaceOpWithNewOp(op, imag, real); + rewriter.replaceOpWithNewOp(op, imag, real, fmf); return success(); } diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index b9ada0fa0f979..a206c7b228d21 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -251,11 +251,11 @@ void mlir::configureOpenMPToLLVMConversionLegality( }); target.addDynamicallyLegalOp< mlir::omp::AtomicUpdateOp, mlir::omp::CriticalOp, mlir::omp::TargetOp, - mlir::omp::TargetDataOp, mlir::omp::OrderedRegionOp, - mlir::omp::ParallelOp, mlir::omp::WsloopOp, mlir::omp::SimdLoopOp, - mlir::omp::MasterOp, mlir::omp::SectionOp, mlir::omp::SectionsOp, - mlir::omp::SingleOp, mlir::omp::TaskgroupOp, mlir::omp::TaskOp, - mlir::omp::DeclareReductionOp, + mlir::omp::TargetDataOp, mlir::omp::LoopNestOp, + mlir::omp::OrderedRegionOp, mlir::omp::ParallelOp, mlir::omp::WsloopOp, + mlir::omp::SimdOp, mlir::omp::MasterOp, mlir::omp::SectionOp, + mlir::omp::SectionsOp, 
mlir::omp::SingleOp, mlir::omp::TaskgroupOp, + mlir::omp::TaskOp, mlir::omp::DeclareReductionOp, mlir::omp::PrivateClauseOp>([&](Operation *op) { return std::all_of(op->getRegions().begin(), op->getRegions().end(), [&](Region ®ion) { @@ -278,11 +278,12 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter, AtomicReadOpConversion, MapInfoOpConversion, ReductionOpConversion, MultiRegionOpConversion, MultiRegionOpConversion, - RegionOpConversion, RegionOpConversion, - ReductionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, + RegionOpConversion, ReductionOpConversion, + RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, - RegionOpConversion, RegionOpConversion, + RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionOpConversion, RegionLessOpWithVarOperandsConversion, diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 7c477f2e1412b..af19ebaea937d 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -766,11 +766,15 @@ static Value broadcastDynamicDimension(PatternRewriter &rewriter, Location loc, // Emit 'then' region of 'scf.if' auto emitThenRegion = [&](OpBuilder &opBuilder, Location loc) { + // It is not safe to cache constants across regions. + // New constants could potentially violate dominance requirements. + IndexPool localPool; + // Emit 'tensor.empty' op SmallVector outputTensorShape; for (auto index : llvm::seq(0, rank)) { auto size = index == dim ? targetSize - : getOrFoldTensorDim(rewriter, loc, indexPool, + : getOrFoldTensorDim(rewriter, loc, localPool, operand, index); outputTensorShape.push_back(size); } @@ -812,9 +816,9 @@ static Value broadcastDynamicDimensions(PatternRewriter &rewriter, Location loc, IndexPool &indexPool, Value operand, ArrayRef targetShape, ArrayRef masterOperands) { - size_t rank = operand.getType().cast().getRank(); - assert(targetShape.size() == rank); - assert(masterOperands.size() == rank); + int64_t rank = operand.getType().cast().getRank(); + assert((int64_t)targetShape.size() == rank); + assert((int64_t)masterOperands.size() == rank); for (auto index : llvm::seq(0, rank)) operand = broadcastDynamicDimension(rewriter, loc, indexPool, operand, index, @@ -1578,17 +1582,16 @@ class GenericResizeConverter : public OpRewritePattern { } // x = x * scale_d + offset; // ix = floor(x / scale_n) - // dx = x / scale_n - ix - Value val = b.create(floatTy, in); - scaleN = b.create(floatTy, scaleN); - scaleD = b.create(floatTy, scaleD); - offset = b.create(floatTy, offset); - val = b.create(val, scaleD); - val = b.create(val, offset); - val = b.create(val, scaleN); - index = b.create(val); - delta = b.create(val, index); - index = b.create(b.getI32Type(), index); + Value val = b.create(in, scaleD); + val = b.create(val, offset); + index = b.create(val, scaleN); + + // rx = x % scale_n + // dx = rx / scale_n + Value r = b.create(val, scaleN); + Value rFp = b.create(floatTy, r); + Value scaleNfp = b.create(floatTy, scaleN); + delta = b.create(rFp, scaleNfp); }; // Compute the ix and dx values for the X and Y dimensions - int case. 
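// A worked instance of the integer resize path above, with illustrative
// values (in = 3, scale_d = 2, scale_n = 4, offset = 1; none of these come
// from the patch):
//   val = in * scale_d + offset = 3 * 2 + 1 = 7
//   ix  = val floordiv scale_n  = 7 / 4 = 1
//   rx  = val mod scale_n       = 3
//   dx  = rx / scale_n          = 3 / 4.0 = 0.75
// The old floating-point path reaches the same pair (7 / 4 = 1.75 gives
// ix = 1 and dx = 0.75) but first converts every operand to float; the
// reworked code stays in integer arithmetic until the final division.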
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp index 3f39cbf03a9a8..8fb8d16486560 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -26,6 +26,8 @@ #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" + #include #include @@ -34,7 +36,7 @@ using namespace mlir::tosa; static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, TypedAttr padAttr, OpBuilder &rewriter) { - // Input should be padded if necessary. + // Input should be padded only if necessary. if (llvm::all_of(pad, [](int64_t p) { return p == 0; })) return input; @@ -47,7 +49,7 @@ static mlir::Value applyPad(Location loc, Value input, ArrayRef pad, SmallVector paddedShape; SmallVector lowIndices; SmallVector highIndices; - for (int i = 0, s = inputShape.size(); i < s; i++) { + for (size_t i : llvm::seq(inputShape.size())) { auto lowPad = pad[i * 2]; auto highPad = pad[i * 2 + 1]; if (ShapedType::isDynamic(inputShape[i])) @@ -131,20 +133,19 @@ static mlir::Value linalgBroadcastAndMaybeExtSI(PatternRewriter &rewriter, static mlir::Value reifyConstantDim(int64_t attr, ImplicitLocOpBuilder &builder) { - return builder.createOrFold( - builder.getIndexType(), - builder.create(builder.getI64IntegerAttr(attr))); + return builder.create(attr); } // Calculating the output width/height using the formula: // H = ((IH+pad_top+pad_bottom-(dilation_y*(KH-1)+1))/stride_y)+1 // W = ((IW+pad_left+pad_right-(dilation_x*(KW-1)+1))/stride_x)+1 -static mlir::Value getConvOutputDim(Location loc, Value inputDim, - int64_t padBeforeAttr, int64_t padAfterAttr, - Value kernelDim, int64_t strideAttr, - int64_t dilationAttr, Type inputETy, - OpBuilder &rewriter) { +static mlir::Value getConvOrPoolOutputDim(Location loc, Value inputDim, + int64_t padBeforeAttr, + int64_t padAfterAttr, Value kernelDim, + int64_t strideAttr, + int64_t dilationAttr, + OpBuilder &rewriter) { ImplicitLocOpBuilder builder(loc, rewriter); auto one = rewriter.create( loc, IntegerAttr::get(inputDim.getType(), 1)); @@ -171,7 +172,6 @@ static SmallVector inferDynamicDimsForConv( ArrayRef dilationAttr, ArrayRef inputSizeDims, ArrayRef kernelSizeDims, OpBuilder &rewriter) { ShapedType inputTy = cast(input.getType()); - Type inputETy = inputTy.getElementType(); int64_t inputRank = inputTy.getRank(); SmallVector dynDims; @@ -190,8 +190,8 @@ static SmallVector inferDynamicDimsForConv( rewriter.create(loc, weight, kernelDim); // H = F(IH, pad_top, pad_bottom, dilation_y, KH, stride_y) dynDims[inputDim] = - getConvOutputDim(loc, initDynDim, padTop, padBottom, kernelDynDim, - stride, dilation, inputETy, rewriter); + getConvOrPoolOutputDim(loc, initDynDim, padTop, padBottom, + kernelDynDim, stride, dilation, rewriter); } } @@ -685,20 +685,61 @@ class MaxPool2dConverter : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; + // Compute the dynamic output sizes of the maxpool operation. 
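// A worked instance of the output-size formula above, with illustrative
// values (IH = 10, pad_top = pad_bottom = 1, KH = 3, dilation_y = 1,
// stride_y = 2; none of these come from the patch):
//   H = ((10 + 1 + 1 - (1 * (3 - 1) + 1)) / 2) + 1 = (9 / 2) + 1 = 5
// The renamed getConvOrPoolOutputDim helper evaluates this once per dynamic
// dimension; pooling reuses it with dilation fixed to 1.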
+ static SmallVector + computeDynamicOutputSizes(tosa::MaxPool2dOp op, PatternRewriter &rewriter) { + TensorType resultTy = op.getType(); + Location loc = op.getLoc(); + + TypedValue input = op.getInput(); + ArrayRef kernel = op.getKernel(); + ArrayRef pad = op.getPad(); + ArrayRef stride = op.getStride(); + + SmallVector dynamicDims; + + // Batch dimension + if (resultTy.isDynamicDim(0)) + dynamicDims.push_back(rewriter.create(loc, input, 0)); + + // Height/width dimensions + for (int64_t dim : {1, 2}) { + if (!resultTy.isDynamicDim(dim)) + continue; + + // Index into the attribute arrays + int64_t index = dim - 1; + + // Input height/width + Value ihw = rewriter.create(loc, input, dim); + + // Kernel height/width + Value khw = rewriter.create(loc, kernel[index]); + + // Output height/width + Value ohw = getConvOrPoolOutputDim(loc, ihw, pad[index * 2], + pad[index * 2 + 1], khw, stride[index], + /*dilationAttr=*/1, rewriter); + dynamicDims.push_back(ohw); + } + + // Channel dimension + if (resultTy.isDynamicDim(3)) + dynamicDims.push_back(rewriter.create(loc, input, 3)); + + return dynamicDims; + } + LogicalResult matchAndRewrite(tosa::MaxPool2dOp op, PatternRewriter &rewriter) const final { Location loc = op.getLoc(); - Value input = op.getInput(); - ShapedType inputTy = cast(input.getType()); + TypedValue input = op.getInput(); + ShapedType inputTy = input.getType(); - ShapedType resultTy = cast(op.getType()); + ShapedType resultTy = op.getType(); Type resultETy = inputTy.getElementType(); - auto dynamicDimsOr = - checkHasDynamicBatchDims(rewriter, op, {input, op.getOutput()}); - if (!dynamicDimsOr.has_value()) - return failure(); - SmallVector dynamicDims = *dynamicDimsOr; + SmallVector dynamicDims = computeDynamicOutputSizes(op, rewriter); // Determine what the initial value needs to be for the max pool op. 
TypedAttr initialAttr; @@ -721,6 +762,7 @@ class MaxPool2dConverter : public OpRewritePattern { pad.resize(2, 0); llvm::append_range(pad, op.getPad()); pad.resize(pad.size() + 2, 0); + Value paddedInput = applyPad(loc, input, pad, initialAttr, rewriter); Value initialValue = rewriter.create(loc, initialAttr); @@ -736,9 +778,7 @@ class MaxPool2dConverter : public OpRewritePattern { loc, resultTy.getShape(), resultTy.getElementType(), dynamicDims); Value filledEmptyTensor = - rewriter - .create(loc, ValueRange{initialValue}, - ValueRange{emptyTensor}) + rewriter.create(loc, initialValue, emptyTensor) .result(); Value fakeWindowDims = diff --git a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp index e0c3abe7a0f71..82a9fb0d49088 100644 --- a/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Affine/IR/ValueBoundsOpInterfaceImpl.cpp @@ -120,9 +120,7 @@ mlir::affine::fullyComposeAndComputeConstantDelta(Value value1, Value value2) { mapOperands.push_back(value1); mapOperands.push_back(value2); affine::fullyComposeAffineMapAndOperands(&map, &mapOperands); - ValueDimList valueDims; - for (Value v : mapOperands) - valueDims.push_back({v, std::nullopt}); return ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::EQ, map, valueDims); + presburger::BoundType::EQ, + ValueBoundsConstraintSet::Variable(map, mapOperands)); } diff --git a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp index 117ee8e8701ad..1a266b72d1f8d 100644 --- a/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/ReifyValueBounds.cpp @@ -16,16 +16,15 @@ using namespace mlir; using namespace mlir::affine; -static FailureOr -reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, - Value value, std::optional dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition, - bool closedUB) { +FailureOr mlir::affine::reifyValueBound( + OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { // Compute bound. AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, value, dim, stopCondition, closedUB))) + boundMap, mapOperands, type, var, stopCondition, closedUB))) return failure(); // Reify bound. @@ -93,7 +92,7 @@ FailureOr mlir::affine::reifyShapedValueDimBound( // the owner of `value`. return v != value; }; - return reifyValueBound(b, loc, type, value, dim, + return reifyValueBound(b, loc, type, {value, dim}, stopCondition ? stopCondition : reifyToOperands, closedUB); } @@ -105,7 +104,7 @@ FailureOr mlir::affine::reifyIndexValueBound( ValueBoundsConstraintSet &cstr) { return v != value; }; - return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, + return reifyValueBound(b, loc, type, value, stopCondition ? 
stopCondition : reifyToOperands, closedUB); } diff --git a/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp index f0d43808bc45d..7cfcc4180539c 100644 --- a/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Arith/IR/ValueBoundsOpInterfaceImpl.cpp @@ -107,9 +107,9 @@ struct SelectOpInterface // If trueValue <= falseValue: // * result <= falseValue // * result >= trueValue - if (cstr.compare(trueValue, dim, + if (cstr.compare(/*lhs=*/{trueValue, dim}, ValueBoundsConstraintSet::ComparisonOperator::LE, - falseValue, dim)) { + /*rhs=*/{falseValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(trueValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(falseValue, dim); @@ -121,9 +121,9 @@ struct SelectOpInterface // If falseValue <= trueValue: // * result <= trueValue // * result >= falseValue - if (cstr.compare(falseValue, dim, + if (cstr.compare(/*lhs=*/{falseValue, dim}, ValueBoundsConstraintSet::ComparisonOperator::LE, - trueValue, dim)) { + /*rhs=*/{trueValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(falseValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(trueValue, dim); diff --git a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp index 79fabd6ed2e99..f87f3d6350c02 100644 --- a/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/IntNarrowing.cpp @@ -449,7 +449,7 @@ struct IndexCastPattern final : NarrowingPattern { return failure(); FailureOr ub = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, in, /*dim=*/std::nullopt, + presburger::BoundType::UB, in, /*stopCondition=*/nullptr, /*closedUB=*/true); if (failed(ub)) return failure(); diff --git a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp index fad221288f190..5fb7953f93700 100644 --- a/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/ReifyValueBounds.cpp @@ -61,16 +61,15 @@ static Value buildArithValue(OpBuilder &b, Location loc, AffineMap map, return buildExpr(map.getResult(0)); } -static FailureOr -reifyValueBound(OpBuilder &b, Location loc, presburger::BoundType type, - Value value, std::optional dim, - ValueBoundsConstraintSet::StopConditionFn stopCondition, - bool closedUB) { +FailureOr mlir::arith::reifyValueBound( + OpBuilder &b, Location loc, presburger::BoundType type, + const ValueBoundsConstraintSet::Variable &var, + ValueBoundsConstraintSet::StopConditionFn stopCondition, bool closedUB) { // Compute bound. AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeBound( - boundMap, mapOperands, type, value, dim, stopCondition, closedUB))) + boundMap, mapOperands, type, var, stopCondition, closedUB))) return failure(); // Materialize tensor.dim/memref.dim ops. @@ -128,7 +127,7 @@ FailureOr mlir::arith::reifyShapedValueDimBound( // the owner of `value`. return v != value; }; - return reifyValueBound(b, loc, type, value, dim, + return reifyValueBound(b, loc, type, {value, dim}, stopCondition ? stopCondition : reifyToOperands, closedUB); } @@ -140,7 +139,7 @@ FailureOr mlir::arith::reifyIndexValueBound( ValueBoundsConstraintSet &cstr) { return v != value; }; - return reifyValueBound(b, loc, type, value, /*dim=*/std::nullopt, + return reifyValueBound(b, loc, type, value, stopCondition ? 
stopCondition : reifyToOperands, closedUB); } diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp index 31500c62c0d60..b595c6dd8a684 100644 --- a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp +++ b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp @@ -165,6 +165,35 @@ int getNumberOfSMETilesForVectorType(VectorType type) { return (vectorRows * vectorCols) / (minNumElts * minNumElts); } +/// Legalize `arith.constant dense` splat operations to fit within SME +/// tiles by decomposing them into tile-sized operations. +struct LegalizeArithConstantOpsByDecomposition + : public OneToNOpConversionPattern { + using OneToNOpConversionPattern::OneToNOpConversionPattern; + + LogicalResult + matchAndRewrite(arith::ConstantOp constantOp, OpAdaptor adaptor, + OneToNPatternRewriter &rewriter) const override { + auto vectorType = dyn_cast(constantOp.getType()); + auto denseAttr = dyn_cast(constantOp.getValueAttr()); + if (!vectorType || !denseAttr || !denseAttr.isSplat()) + return failure(); + + if (!isMultipleOfSMETileVectorType(vectorType)) + return rewriter.notifyMatchFailure(constantOp, + kMatchFailureNotSMETileTypeMultiple); + + auto smeTileType = getSMETileTypeForElement(vectorType.getElementType()); + auto tileCount = getNumberOfSMETilesForVectorType(vectorType); + auto tileSplat = rewriter.create( + constantOp.getLoc(), denseAttr.resizeSplat(smeTileType)); + rewriter.replaceOp(constantOp, SmallVector(tileCount, tileSplat), + adaptor.getResultMapping()); + + return success(); + } +}; + /// Legalize `vector.outerproduct` operations to fit within SME tiles by /// decomposing them into tile-sized operations. struct LegalizeVectorOuterProductOpsByDecomposition @@ -637,7 +666,8 @@ struct VectorLegalizationPass // Note: High benefit to ensure masked outer products are lowered first. patterns.add( converter, context, 1024); - patterns.add(converter, context); populateFuncTypeConversionPatterns(converter, patterns); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp index 8c4b70db24898..518d2e138c02a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp @@ -72,8 +72,10 @@ static LogicalResult computePaddedShape(linalg::LinalgOp opToPad, // Otherwise, try to compute a constant upper bound for the size value. 
FailureOr upperBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, opOperand->get(), - /*dim=*/i, /*stopCondition=*/nullptr, /*closedUB=*/true); + presburger::BoundType::UB, + {opOperand->get(), + /*dim=*/i}, + /*stopCondition=*/nullptr, /*closedUB=*/true); if (failed(upperBound)) { LLVM_DEBUG(DBGS() << "----could not compute a bounding box for padding"); return failure(); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index ac896d6c30d04..71eb59d40836c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -257,14 +257,12 @@ FailureOr mlir::linalg::promoteSubviewAsNewBuffer( if (auto attr = llvm::dyn_cast_if_present(rangeValue.size)) { size = getValueOrCreateConstantIndexOp(b, loc, rangeValue.size); } else { - Value materializedSize = - getValueOrCreateConstantIndexOp(b, loc, rangeValue.size); FailureOr upperBound = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, materializedSize, /*dim=*/std::nullopt, + presburger::BoundType::UB, rangeValue.size, /*stopCondition=*/nullptr, /*closedUB=*/true); size = failed(upperBound) - ? materializedSize + ? getValueOrCreateConstantIndexOp(b, loc, rangeValue.size) : b.create(loc, *upperBound); } LLVM_DEBUG(llvm::dbgs() << "Extracted tightest: " << size << "\n"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 25785653a7167..df61381432921 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -1412,10 +1412,11 @@ static SmallVector getTiledPackShape(tensor::PackOp packOp, /// Create a TransferReadOp from `source` with static shape `readShape`. If the /// vector type for the read is not the same as the type of `source`, then a -/// mask is created on the read. +/// mask is created on the read. If the `doMasking` parameter is set to false, +/// we update the `inBounds` attribute instead of masking. static Value createReadOrMaskedRead(OpBuilder &builder, Location loc, Value source, ArrayRef readShape, - Value padValue) { + Value padValue, bool doMasking = true) { assert(llvm::none_of(readShape, [](int64_t s) { return s == ShapedType::kDynamic; })); auto sourceShape = dyn_cast(source.getType()).getShape(); @@ -1424,14 +1425,21 @@ static Value createReadOrMaskedRead(OpBuilder &builder, Location loc, auto vectorType = VectorType::get(readShape, padValue.getType()); int64_t readRank = readShape.size(); auto zero = builder.create(loc, 0); + SmallVector inBoundsVal(readRank, true); + if (!doMasking) { + // Update the inBounds attribute. 
+ for (unsigned i = 0; i < readRank; i++) + inBoundsVal[i] = sourceShape[i] == readShape[i]; + } auto transferReadOp = builder.create( loc, /*vectorType=*/vectorType, /*source=*/source, /*indices=*/SmallVector(readRank, zero), /*padding=*/padValue, - /*inBounds=*/SmallVector(readRank, true)); - if (llvm::equal(readShape, sourceShape)) { + /*inBounds=*/inBoundsVal); + + if (llvm::equal(readShape, sourceShape) || !doMasking) { return transferReadOp; } SmallVector mixedSourceDims = @@ -1482,11 +1490,10 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, return write; } -/// Vectorize tensor::PackOp with (1) static innerTiles and (2) constant -/// padding value into: +/// Vectorize tensor::PackOp with (1) static innerTiles, (2) constant +/// padding value, and (3) input vector sizes into: /// masked_transfer_read->shape_cast->transpose->transfer_write_in_bounds /// As in the following example: -/// /// %pack = tensor.pack %src inner_dims_pos = [2, 1] inner_tiles = [16, 2] /// into %dst : tensor<32x8x16xf32> -> tensor<32x4x1x16x2xf32> /// @@ -1505,6 +1512,10 @@ static Operation *createWriteOrMaskedWrite(OpBuilder &builder, Location loc, /// %empty[%c0_0, %c0_0, %c0_0, %c0_0, %c0_0] /// {in_bounds = [true, true, true, true, true]} /// : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +/// +/// If the (3) input vector sizes are not provided, the vector sizes are +/// determined by the result tensor shape and we update the inBounds +/// attribute instead of masking. static LogicalResult vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, ArrayRef inputVectorSizes, @@ -1525,6 +1536,16 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, (void)status; // prevent unused variable warning on non-assert builds. assert(succeeded(status) && "failed to reify result shapes"); + // If the input vector sizes are not provided, the vector sizes are + // determined by the result tensor shape; in that case the inBounds + // attribute is updated instead of masking. + bool doMasking = true; + if (inputVectorSizes.empty()) { + ArrayRef resultTensorShape = packOp.getDestType().getShape(); + inputVectorSizes = resultTensorShape.take_front(packOp.getSourceRank()); + doMasking = false; + } + // Create masked TransferReadOp. SmallVector inputShape(inputVectorSizes); auto innerTiles = packOp.getStaticInnerTiles(); @@ -1536,7 +1557,7 @@ vectorizeAsTensorPackOp(RewriterBase &rewriter, tensor::PackOp packOp, for (auto [idx, size] : enumerate(innerTiles)) inputShape[innerDimsPos[idx]] *= size; auto maskedRead = createReadOrMaskedRead(rewriter, loc, packOp.getSource(), - inputShape, padValue); + inputShape, padValue, doMasking); // Create ShapeCastOp. SmallVector destShape(inputVectorSizes); @@ -1763,7 +1784,7 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op, /// Returns success if `inputVectorSizes` is a valid masking configuraion for /// given `shape`, i.e., it meets: /// 1. The numbers of elements in both array are equal. -/// 2. `inputVectorSizes` does nos have dynamic dimensions. +/// 2. `inputVectorSizes` does not have dynamic dimensions. /// 3. All the values in `inputVectorSizes` are greater than or equal to /// static sizes in `shape`. static LogicalResult @@ -1881,18 +1902,25 @@ static LogicalResult vectorizeLinalgOpPrecondition( return success(); } -/// TODO: Use a matcher to check for a constant padding value. 
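// A minimal standalone sketch of the unmasked read path above, assuming
// static shapes: when masking is disabled, a dimension is marked in-bounds
// only if the read shape matches the source shape. The helper name and the
// use of std::vector are illustrative, not taken from the patch.
#include <cstdint>
#include <vector>
static std::vector<bool> sketchInBounds(const std::vector<int64_t> &sourceShape,
                                        const std::vector<int64_t> &readShape) {
  // Start out all in-bounds, as createReadOrMaskedRead does.
  std::vector<bool> inBounds(readShape.size(), true);
  for (size_t i = 0; i < readShape.size(); ++i)
    inBounds[i] = sourceShape[i] == readShape[i];
  return inBounds;
}
// E.g. reading a 32x8x16 source with readShape 32x8x16 yields
// {true, true, true}, so the transfer_read needs neither a mask nor
// out-of-bounds padding at runtime.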
static LogicalResult vectorizePackOpPrecondition(tensor::PackOp packOp, ArrayRef inputVectorSizes) { auto padValue = packOp.getPaddingValue(); - if (padValue && !padValue.getDefiningOp()) { + Attribute cstAttr; + if (padValue && !matchPattern(padValue, m_Constant(&cstAttr))) { LDBG("pad value is not constant: " << packOp << "\n"); return failure(); } - ArrayRef resultTensorShape = packOp.getDestType().getShape(); - if (failed(isValidMaskedInputVector( + bool satisfyEmptyCond = true; + if (inputVectorSizes.empty()) { + if (!packOp.getDestType().hasStaticShape() || + !packOp.getSourceType().hasStaticShape()) + satisfyEmptyCond = false; + } + + if (!satisfyEmptyCond && + failed(isValidMaskedInputVector( resultTensorShape.take_front(packOp.getSourceRank()), inputVectorSizes))) return failure(); diff --git a/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp index 10ba508265e7b..1f06318cbd60e 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/IndependenceTransforms.cpp @@ -23,12 +23,11 @@ static FailureOr makeIndependent(OpBuilder &b, Location loc, ValueRange independencies) { if (ofr.is()) return ofr; - Value value = ofr.get(); AffineMap boundMap; ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeIndependentBound( - boundMap, mapOperands, presburger::BoundType::UB, value, - /*dim=*/std::nullopt, independencies, /*closedUB=*/true))) + boundMap, mapOperands, presburger::BoundType::UB, ofr, independencies, + /*closedUB=*/true))) return failure(); return affine::materializeComputedBound(b, loc, boundMap, mapOperands); } diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 90b49b2528b79..5d2281ce6094f 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1548,90 +1548,41 @@ void printWsloop(OpAsmPrinter &p, Operation *op, Region ®ion, p.printRegion(region, /*printEntryBlockArgs=*/false); } -/// loop-control ::= `(` ssa-id-list `)` `:` type `=` loop-bounds -/// loop-bounds := `(` ssa-id-list `)` to `(` ssa-id-list `)` inclusive? steps -/// steps := `step` `(`ssa-id-list`)` -ParseResult -parseLoopControl(OpAsmParser &parser, Region ®ion, - SmallVectorImpl &lowerBound, - SmallVectorImpl &upperBound, - SmallVectorImpl &steps, - SmallVectorImpl &loopVarTypes, UnitAttr &inclusive) { - // Parse an opening `(` followed by induction variables followed by `)` - SmallVector ivs; - Type loopVarType; - if (parser.parseArgumentList(ivs, OpAsmParser::Delimiter::Paren) || - parser.parseColonType(loopVarType) || - // Parse loop bounds. - parser.parseEqual() || - parser.parseOperandList(lowerBound, ivs.size(), - OpAsmParser::Delimiter::Paren) || - parser.parseKeyword("to") || - parser.parseOperandList(upperBound, ivs.size(), - OpAsmParser::Delimiter::Paren)) - return failure(); - - if (succeeded(parser.parseOptionalKeyword("inclusive"))) - inclusive = UnitAttr::get(parser.getBuilder().getContext()); - - // Parse step values. - if (parser.parseKeyword("step") || - parser.parseOperandList(steps, ivs.size(), OpAsmParser::Delimiter::Paren)) - return failure(); - - // Now parse the body. 
- loopVarTypes = SmallVector(ivs.size(), loopVarType); - for (auto &iv : ivs) - iv.type = loopVarType; - - return parser.parseRegion(region, ivs); -} - -void printLoopControl(OpAsmPrinter &p, Operation *op, Region ®ion, - ValueRange lowerBound, ValueRange upperBound, - ValueRange steps, TypeRange loopVarTypes, - UnitAttr inclusive) { - auto args = region.front().getArguments(); - p << " (" << args << ") : " << args[0].getType() << " = (" << lowerBound - << ") to (" << upperBound << ") "; - if (inclusive) - p << "inclusive "; - p << "step (" << steps << ") "; - p.printRegion(region, /*printEntryBlockArgs=*/false); -} - //===----------------------------------------------------------------------===// // Simd construct [2.9.3.1] //===----------------------------------------------------------------------===// -void SimdLoopOp::build(OpBuilder &builder, OperationState &state, - const SimdLoopClauseOps &clauses) { +void SimdOp::build(OpBuilder &builder, OperationState &state, + const SimdClauseOps &clauses) { MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: privateVars, reductionByRefAttr, reductionVars, // privatizers, reductionDeclSymbols. - SimdLoopOp::build( - builder, state, clauses.loopLBVar, clauses.loopUBVar, clauses.loopStepVar, - clauses.alignedVars, makeArrayAttr(ctx, clauses.alignmentAttrs), - clauses.ifVar, clauses.nontemporalVars, clauses.orderAttr, - clauses.simdlenAttr, clauses.safelenAttr, clauses.loopInclusiveAttr); + SimdOp::build(builder, state, clauses.alignedVars, + makeArrayAttr(ctx, clauses.alignmentAttrs), clauses.ifVar, + clauses.nontemporalVars, clauses.orderAttr, clauses.simdlenAttr, + clauses.safelenAttr); } -LogicalResult SimdLoopOp::verify() { - if (this->getLowerBound().empty()) { - return emitOpError() << "empty lowerbound for simd loop operation"; - } - if (this->getSimdlen().has_value() && this->getSafelen().has_value() && - this->getSimdlen().value() > this->getSafelen().value()) { +LogicalResult SimdOp::verify() { + if (getSimdlen().has_value() && getSafelen().has_value() && + getSimdlen().value() > getSafelen().value()) return emitOpError() << "simdlen clause and safelen clause are both present, but the " "simdlen value is not less than or equal to safelen value"; - } - if (verifyAlignedClause(*this, this->getAlignmentValues(), - this->getAlignedVars()) + + if (verifyAlignedClause(*this, getAlignmentValues(), getAlignedVars()) .failed()) return failure(); - if (verifyNontemporalClause(*this, this->getNontemporalVars()).failed()) + + if (verifyNontemporalClause(*this, getNontemporalVars()).failed()) return failure(); + + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (getNestedWrapper()) + return emitOpError() << "must wrap an 'omp.loop_nest' directly"; + return success(); } @@ -1656,6 +1607,17 @@ LogicalResult DistributeOp::verify() { return emitError( "expected equal sizes for allocate and allocator variables"); + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after DISTRIBUTE. + if (!isa(nested)) + return emitError() << "only supported nested wrappers are 'omp.parallel' " + "and 'omp.simd'"; + } + return success(); } @@ -1818,9 +1780,8 @@ void TaskloopOp::build(OpBuilder &builder, OperationState &state, MLIRContext *ctx = builder.getContext(); // TODO Store clauses in op: reductionByRefAttr, privateVars, privatizers. 
TaskloopOp::build( - builder, state, clauses.loopLBVar, clauses.loopUBVar, clauses.loopStepVar, - clauses.loopInclusiveAttr, clauses.ifVar, clauses.finalVar, - clauses.untiedAttr, clauses.mergeableAttr, clauses.inReductionVars, + builder, state, clauses.ifVar, clauses.finalVar, clauses.untiedAttr, + clauses.mergeableAttr, clauses.inReductionVars, makeArrayAttr(ctx, clauses.inReductionDeclSymbols), clauses.reductionVars, makeArrayAttr(ctx, clauses.reductionDeclSymbols), clauses.priorityVar, clauses.allocateVars, clauses.allocatorVars, clauses.grainsizeVar, @@ -1859,6 +1820,16 @@ LogicalResult TaskloopOp::verify() { "the grainsize clause and num_tasks clause are mutually exclusive and " "may not appear on the same taskloop directive"); } + + if (!isWrapper()) + return emitOpError() << "must be a loop wrapper"; + + if (LoopWrapperInterface nested = getNestedWrapper()) { + // Check for the allowed leaf constructs that may appear in a composite + // construct directly after TASKLOOP. + if (!isa(nested)) + return emitError() << "only supported nested wrapper is 'omp.simd'"; + } return success(); } diff --git a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp index 087ffc438a830..17a1c016ea16d 100644 --- a/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/IR/ValueBoundsOpInterfaceImpl.cpp @@ -61,12 +61,13 @@ struct ForOpInterface // An EQ constraint can be added if the yielded value (dimension size) // equals the corresponding block argument (dimension size). if (cstr.populateAndCompare( - yieldedValue, dim, ValueBoundsConstraintSet::ComparisonOperator::EQ, - iterArg, dim)) { + /*lhs=*/{yieldedValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::EQ, + /*rhs=*/{iterArg, dim})) { if (dim.has_value()) { cstr.bound(value)[*dim] == cstr.getExpr(initArg, dim); } else { - cstr.bound(value) == initArg; + cstr.bound(value) == cstr.getExpr(initArg); } } } @@ -113,8 +114,9 @@ struct IfOpInterface // * result <= elseValue // * result >= thenValue if (cstr.populateAndCompare( - thenValue, dim, ValueBoundsConstraintSet::ComparisonOperator::LE, - elseValue, dim)) { + /*lhs=*/{thenValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::LE, + /*rhs=*/{elseValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(thenValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(elseValue, dim); @@ -127,8 +129,9 @@ struct IfOpInterface // * result <= thenValue // * result >= elseValue if (cstr.populateAndCompare( - elseValue, dim, ValueBoundsConstraintSet::ComparisonOperator::LE, - thenValue, dim)) { + /*lhs=*/{elseValue, dim}, + ValueBoundsConstraintSet::ComparisonOperator::LE, + /*rhs=*/{thenValue, dim})) { if (dim) { cstr.bound(value)[*dim] >= cstr.getExpr(elseValue, dim); cstr.bound(value)[*dim] <= cstr.getExpr(thenValue, dim); diff --git a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp index fea2f659535bb..7b4024b6861a7 100644 --- a/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/UpliftWhileToFor.cpp @@ -101,38 +101,30 @@ FailureOr mlir::scf::upliftWhileToForLoop(RewriterBase &rewriter, Block *afterBody = loop.getAfterBody(); scf::YieldOp afterTerm = loop.getYieldOp(); - auto argNumber = inductionVar.getArgNumber(); - auto afterTermIndArg = afterTerm.getResults()[argNumber]; + unsigned argNumber = inductionVar.getArgNumber(); + Value afterTermIndArg = afterTerm.getResults()[argNumber]; - auto 
inductionVarAfter = afterBody->getArgument(argNumber); - - Value step; + Value inductionVarAfter = afterBody->getArgument(argNumber); // Find suitable `addi` op inside `after` block, one of the args must be an // Induction var passed from `before` block and second arg must be defined // outside of the loop and will be considered step value. // TODO: Add `subi` support? - for (auto &use : inductionVarAfter.getUses()) { - auto owner = dyn_cast(use.getOwner()); - if (!owner) - continue; - - auto other = - (inductionVarAfter == owner.getLhs() ? owner.getRhs() : owner.getLhs()); - if (!dom.properlyDominates(other, loop)) - continue; - - if (afterTermIndArg != owner.getResult()) - continue; + auto addOp = afterTermIndArg.getDefiningOp(); + if (!addOp) + return rewriter.notifyMatchFailure(loop, "Didn't find a suitable 'addi' op"); - step = other; break; + Value step; + if (addOp.getLhs() == inductionVarAfter) { + step = addOp.getRhs(); + } else if (addOp.getRhs() == inductionVarAfter) { + step = addOp.getLhs(); } - if (!step) - return rewriter.notifyMatchFailure(loop, "Didn't found suitable 'addi' op"); + if (!step || !dom.properlyDominates(step, loop)) + return rewriter.notifyMatchFailure(loop, "Invalid 'addi' form"); - auto lb = loop.getInits()[argNumber]; + Value lb = loop.getInits()[argNumber]; assert(lb.getType().isIntOrIndex()); assert(lb.getType() == ub.getType()); diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp index e9058394d33da..516b0943bdcfa 100644 --- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp +++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp @@ -30,6 +30,14 @@ #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrDefs.cpp.inc" #include "mlir/Dialect/SparseTensor/IR/SparseTensorAttrEnums.cpp.inc" +// Forward declarations; the following custom print/parsing methods are referenced +// by the generated code for SparseTensorTypes.td. +static mlir::ParseResult parseLevelRange(mlir::AsmParser &, + mlir::sparse_tensor::Level &, + mlir::sparse_tensor::Level &); +static void printLevelRange(mlir::AsmPrinter &, mlir::sparse_tensor::Level, + mlir::sparse_tensor::Level); + #define GET_TYPEDEF_CLASSES #include "mlir/Dialect/SparseTensor/IR/SparseTensorTypes.cpp.inc" @@ -1953,6 +1961,108 @@ LogicalResult SortOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Sparse Tensor Iteration Operations. 
+//===----------------------------------------------------------------------===// + +IterSpaceType IteratorType::getIterSpaceType() const { + return IterSpaceType::get(getContext(), getEncoding(), getLoLvl(), + getHiLvl()); +} + +IteratorType IterSpaceType::getIteratorType() const { + return IteratorType::get(getContext(), getEncoding(), getLoLvl(), getHiLvl()); +} + +/// Parses a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static ParseResult parseLevelRange(AsmParser &parser, Level &lvlLo, + Level &lvlHi) { + if (parser.parseInteger(lvlLo)) + return failure(); + + if (succeeded(parser.parseOptionalKeyword("to"))) { + if (parser.parseInteger(lvlHi)) + return failure(); + } else { + lvlHi = lvlLo + 1; + } + + if (lvlHi <= lvlLo) + parser.emitError(parser.getNameLoc(), + "expect larger level upper bound than lower bound"); + + return success(); +} + +/// Parses a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static ParseResult parseLevelRange(OpAsmParser &parser, IntegerAttr &lvlLoAttr, + IntegerAttr &lvlHiAttr) { + Level lvlLo, lvlHi; + if (parseLevelRange(parser, lvlLo, lvlHi)) + return failure(); + + lvlLoAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlLo); + lvlHiAttr = IntegerAttr::get(parser.getBuilder().getIndexType(), lvlHi); + return success(); +} + +/// Prints a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static void printLevelRange(AsmPrinter &p, Level lo, Level hi) { + + if (lo + 1 == hi) + p << lo; + else + p << lo << " to " << hi; +} + +/// Prints a level range in the form "$lo `to` $hi" +/// or simply "$lo" if $hi - $lo = 1 +static void printLevelRange(OpAsmPrinter &p, Operation *, IntegerAttr lvlLo, + IntegerAttr lvlHi) { + unsigned lo = lvlLo.getValue().getZExtValue(); + unsigned hi = lvlHi.getValue().getZExtValue(); + printLevelRange(p, lo, hi); +} + +LogicalResult ExtractIterSpaceOp::inferReturnTypes( + MLIRContext *ctx, std::optional loc, ValueRange ops, + DictionaryAttr attr, OpaqueProperties prop, RegionRange region, + SmallVectorImpl &ret) { + + ExtractIterSpaceOp::Adaptor adaptor(ops, attr, prop, region); + SparseTensorType stt = getSparseTensorType(adaptor.getTensor()); + ret.push_back(IterSpaceType::get(ctx, stt.getEncoding(), adaptor.getLoLvl(), + adaptor.getHiLvl())); + return success(); +} + +LogicalResult ExtractIterSpaceOp::verify() { + if (getLoLvl() >= getHiLvl()) + return emitOpError("expected smaller level low than level high"); + + TypedValue pIter = getParentIter(); + if ((pIter && getLoLvl() == 0) || (!pIter && getLoLvl() != 0)) { + return emitOpError( + "parent iterator should be specified iff level lower bound equals 0"); + } + + if (pIter) { + IterSpaceType spaceTp = getResultSpace().getType(); + if (pIter.getType().getEncoding() != spaceTp.getEncoding()) + return emitOpError( + "mismatch in parent iterator encoding and iteration space encoding."); + + if (spaceTp.getLoLvl() != pIter.getType().getHiLvl()) + return emitOpError("parent iterator should be used to extract an " + "iteration space from a consecutive level."); + } + + return success(); +} + /// Materialize a single constant operation from a given attribute value with /// the desired resultant type. 
Operation *SparseTensorDialect::materializeConstant(OpBuilder &builder, diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp index f497be6e48eba..3a8972072ac3b 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparsificationAndBufferizationPass.cpp @@ -24,6 +24,7 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp index 67080d8e301c1..d25efcf50ec56 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -289,8 +289,7 @@ static UnpackTileDimInfo getUnpackTileDimInfo(OpBuilder &b, UnPackOp unpackOp, info.isAlignedToInnerTileSize = false; FailureOr cstSize = ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType::UB, - getValueOrCreateConstantIndexOp(b, loc, tileSize), /*dim=*/std::nullopt, + presburger::BoundType::UB, tileSize, /*stopCondition=*/nullptr, /*closedUB=*/true); std::optional cstInnerSize = getConstantIntValue(innerTileSize); if (!failed(cstSize) && cstInnerSize) { diff --git a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp index 721730862d49b..a89ce20048dff 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/IndependenceTransforms.cpp @@ -28,7 +28,8 @@ static FailureOr makeIndependent(OpBuilder &b, Location loc, ValueDimList mapOperands; if (failed(ValueBoundsConstraintSet::computeIndependentBound( boundMap, mapOperands, presburger::BoundType::UB, value, - /*dim=*/std::nullopt, independencies, /*closedUB=*/true))) + independencies, + /*closedUB=*/true))) return failure(); return mlir::affine::materializeComputedBound(b, loc, boundMap, mapOperands); } diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index 2dd91e2f7a170..15381ec520e21 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -154,7 +154,7 @@ bool mlir::tensor::isCastLikeInsertSliceOp(InsertSliceOp op) { continue; } FailureOr equalDimSize = ValueBoundsConstraintSet::areEqual( - op.getSource(), op.getResult(), srcDim, resultDim); + {op.getSource(), srcDim}, {op.getResult(), resultDim}); if (failed(equalDimSize) || !*equalDimSize) return false; ++srcDim; @@ -178,7 +178,7 @@ bool mlir::tensor::isCastLikeExtractSliceOp(ExtractSliceOp op) { continue; } FailureOr equalDimSize = ValueBoundsConstraintSet::areEqual( - op.getSource(), op.getResult(), dim, resultDim); + {op.getSource(), dim}, {op.getResult(), resultDim}); if (failed(equalDimSize) || !*equalDimSize) return false; ++resultDim; diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index dc19022219e5b..53f958caa0bdb 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -396,6 +396,13 @@ DiagnosedSilenceableFailure transform::ApplyPatternsOp::applyToOne( 
static_cast(rewriter.getListener()); FrozenRewritePatternSet frozenPatterns(std::move(patterns)); + config.maxIterations = getMaxIterations() == static_cast(-1) + ? GreedyRewriteConfig::kNoLimit + : getMaxIterations(); + config.maxNumRewrites = getMaxNumRewrites() == static_cast(-1) + ? GreedyRewriteConfig::kNoLimit + : getMaxNumRewrites(); + // Apply patterns and CSE repetitively until a fixpoint is reached. If no CSE // was requested, apply the greedy pattern rewrite only once. (The greedy // pattern rewrite driver already iterates to a fixpoint internally.) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 0b3f4b9c9dbea..24719fe748fe4 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -32,6 +32,17 @@ void XeGPUDialect::initialize() { //===----------------------------------------------------------------------===// // XeGPU_TensorDescAttr //===----------------------------------------------------------------------===// +TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, + xegpu::MemoryScope memory_scope, + int array_length, bool boundary_check, + bool scattered) { + auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); + auto lengthAttr = + IntegerAttr::get(IntegerType::get(context, 64), array_length); + auto boundaryAttr = BoolAttr::get(context, boundary_check); + auto scatteredAttr = BoolAttr::get(context, scattered); + return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr); +} //===----------------------------------------------------------------------===// // XeGPU_TensorDescType @@ -96,6 +107,16 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { printer << ">"; } +TensorDescType TensorDescType::get(llvm::ArrayRef shape, + mlir::Type elementType, bool scattered, + int array_length, MemoryScope memory_scope, + bool boundary_check) { + auto context = elementType.getContext(); + auto attr = TensorDescAttr::get(context, memory_scope, array_length, + boundary_check, scattered); + return Base::get(context, shape, elementType, attr); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 02106f221f323..530c50ef74f7a 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -9,6 +9,9 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/TypeUtilities.h" + +#include "llvm/Support/Debug.h" #define DEBUG_TYPE "xegpu" @@ -16,8 +19,8 @@ namespace mlir { namespace xegpu { static void transpose(llvm::ArrayRef trans, - std::vector &shape) { - std::vector old = shape; + SmallVector &shape) { + SmallVector old = shape; for (size_t i = 0; i < trans.size(); i++) shape[i] = old[trans[i]]; } @@ -38,6 +41,38 @@ static std::string makeString(T array, bool breakline = false) { return buf; } +static SmallVector getShapeOf(Type type) { + SmallVector shape; + if (auto ty = llvm::dyn_cast(type)) + shape = SmallVector(ty.getShape()); + else + shape.push_back(1); + return shape; +} + +static int64_t getRankOf(Value val) { + auto type = val.getType(); + if (auto ty = llvm::dyn_cast(type)) + return ty.getRank(); + return 0; +} + +static bool isReadHintOrNone(const CachePolicyAttr &attr) { + if (!attr) + return true; + auto kind = attr.getValue(); + return kind == CachePolicy::CACHED || kind == CachePolicy::UNCACHED || + kind == 
CachePolicy::STREAMING || kind == CachePolicy::READ_INVALIDATE; } + +static bool isWriteHintOrNone(const CachePolicyAttr &attr) { + if (!attr) + return true; + auto kind = attr.getValue(); + return kind == CachePolicy::CACHED || kind == CachePolicy::UNCACHED || + kind == CachePolicy::WRITE_BACK || kind == CachePolicy::WRITE_THROUGH; +} + //===----------------------------------------------------------------------===// // XeGPU_CreateNdDescOp //===----------------------------------------------------------------------===// @@ -114,6 +149,29 @@ LogicalResult CreateNdDescOp::verify() { return emitOpError("TensorDesc should have the same element " "type with the source if it is a memref.\n"); + if (getType().getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchNdOp +//===----------------------------------------------------------------------===// +LogicalResult PrefetchNdOp::verify() { + auto tdescTy = getTensorDescType(); + if (tdescTy.getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); + + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + return success(); } @@ -125,22 +183,26 @@ LogicalResult LoadNdOp::verify() { auto valueTy = getType(); if (tdescTy.getRank() != 2) - return emitOpError( - "The TensorDesc for LoadNdOp should be a 2D TensorDesc."); + return emitOpError("Expecting a 2D TensorDesc.\n"); + + if (tdescTy.getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valueTy) return emitOpError("Invalid result, it should be a VectorType.\n"); - auto tdescElemTy = tdescTy.getElementType(); - auto valueElemTy = valueTy.getElementType(); + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); - if (tdescElemTy != valueElemTy) - return emitOpError( - "Value should have the same element type as TensorDesc."); + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); auto array_len = tdescTy.getArrayLength(); - auto tdescShape = tdescTy.getShape().vec(); - auto valueShape = valueTy.getShape().vec(); + auto tdescShape = getShapeOf(tdescTy); + auto valueShape = getShapeOf(valueTy); if (getTranspose()) { auto trans = getTranspose().value(); @@ -174,26 +236,174 @@ LogicalResult LoadNdOp::verify() { // XeGPU_StoreNdOp //===----------------------------------------------------------------------===// LogicalResult StoreNdOp::verify() { - auto dstTy = getTensorDesc().getType(); // Tile - auto valTy = getValue().getType().cast(); // Vector + auto dstTy = getTensorDescType(); // Tile + auto valTy = getValueType(); // Vector if (dstTy.getRank() != 2) - return emitOpError("Expecting a 2D TensorDesc shape.\n"); + return emitOpError("Expecting a 2D TensorDesc.\n"); + + if (dstTy.getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valTy) return emitOpError("Exepcting a VectorType result.\n"); - auto dstElemTy = dstTy.getElementType(); - auto valElemTy = valTy.getElementType(); + if (!isWriteHintOrNone(getL1HintAttr())) + 
return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isWriteHintOrNone(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isWriteHintOrNone(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + return success(); +} - if (dstElemTy != valElemTy) { - return emitOpError() << "The element type of the value should " - "match the elementtype of the TensorDesc.\n"; +//===----------------------------------------------------------------------===// +// XeGPU_UpdateNDOffsetOp +//===----------------------------------------------------------------------===// +LogicalResult UpdateNdOffsetOp::verify() { + auto ty = getTensorDescType(); + if (ty.getScattered()) + return emitOpError("Expects a non-scattered TensorDesc.\n"); + + // number of offsets specified must match the rank of the tensor descriptor + if (ty.getRank() != (int64_t)getNumOffsets()) { + return emitOpError("Invalid number of offsets."); } + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_CreateDescOp +//===----------------------------------------------------------------------===// +void CreateDescOp::build(OpBuilder &builder, OperationState &state, + TensorDescType TensorDesc, Value source, + llvm::ArrayRef offsets, + uint32_t chunk_size) { + llvm::SmallVector staticOffsets; + llvm::SmallVector dynamicOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets, + chunk_size); +} + +LogicalResult CreateDescOp::verify() { + auto tdescTy = getTensorDescType(); + auto chunkSize = getChunkSize(); + + if (getRankOf(getSource()) > 1) + return emitOpError( + "Expecting the source to be a 1D memref or pointer (uint64_t)."); + + if (!tdescTy.getScattered()) + return emitOpError("Expects a scattered TensorDesc.\n"); + + SmallVector shape({(int64_t)getNumOffsets()}); + if (chunkSize != 1) + shape.push_back(chunkSize); + + auto tdescShape = getShapeOf(tdescTy); + if (shape != tdescShape) + return emitOpError("Incorrect TensorDesc shape. 
") << "Expected shape is " << makeString(shape) << "\n"; + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_PrefetchOp +//===----------------------------------------------------------------------===// +LogicalResult PrefetchOp::verify() { + auto tdescTy = getTensorDescType(); + if (!tdescTy.getScattered()) + return emitOpError("Expects a scattered TensorDesc.\n"); + + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_LoadGatherOp +//===----------------------------------------------------------------------===// +LogicalResult LoadGatherOp::verify() { + auto tdescTy = getTensorDescType(); + auto maskTy = getMaskType(); + auto valueTy = getValueType(); + + if (!tdescTy.getScattered()) + return emitOpError("Expects a scattered TensorDesc.\n"); + + if (!isReadHintOrNone(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isReadHintOrNone(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isReadHintOrNone(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + auto tdescElemTy = tdescTy.getElementType(); + auto valueElemTy = getElementType(); + if (tdescElemTy != valueElemTy) + return emitOpError( + "Value should have the same element type as TensorDesc."); + + auto maskShape = getShapeOf(maskTy); + auto valueShape = getShapeOf(valueTy); + auto tdescShape = getShapeOf(tdescTy); + + if (tdescShape[0] != maskShape[0]) + return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); + + if (getTransposeAttr()) { + auto trans = getTranspose().value(); + if (tdescShape.size() < trans.size()) + emitWarning("Invalid transpose attr. 
It is ignored."); + else + transpose(trans, tdescShape); + } + + if (valueShape != tdescShape) + return emitOpError("Unexpected result shape ") + << "(Expected shape: " << makeString(tdescShape) + << ", Given shape: " << makeString(valueShape) << ").\n"; + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_StoreScatterOp +//===----------------------------------------------------------------------===// +LogicalResult StoreScatterOp::verify() { + auto tdescTy = getTensorDescType(); + if (!tdescTy.getScattered()) + return emitOpError("Expects a scattered TensorDesc.\n"); + + if (!isWriteHintOrNone(getL1HintAttr())) + return emitOpError("invalid l1_hint: ") << getL1HintAttr(); + + if (!isWriteHintOrNone(getL2HintAttr())) + return emitOpError("invalid l2_hint: ") << getL2HintAttr(); + + if (!isWriteHintOrNone(getL3HintAttr())) + return emitOpError("invalid l3_hint: ") << getL3HintAttr(); + + auto maskTy = getMaskType(); + auto maskShape = getShapeOf(maskTy); + auto tdescShape = getShapeOf(tdescTy); + if (tdescShape[0] != maskShape[0]) + return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - if (dstTy.getShape() != valTy.getShape()) - return emitOpError() - << "The result shape should match the TensorDesc shape.\n"; return success(); } diff --git a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp index ffa4c0b55cad7..87937591e60ad 100644 --- a/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp +++ b/mlir/lib/Interfaces/ValueBoundsOpInterface.cpp @@ -25,6 +25,12 @@ namespace mlir { #include "mlir/Interfaces/ValueBoundsOpInterface.cpp.inc" } // namespace mlir +static Operation *getOwnerOfValue(Value value) { + if (auto bbArg = dyn_cast(value)) + return bbArg.getOwner()->getParentOp(); + return value.getDefiningOp(); +} + HyperrectangularSlice::HyperrectangularSlice(ArrayRef offsets, ArrayRef sizes, ArrayRef strides) @@ -67,6 +73,83 @@ static std::optional getConstantIntValue(OpFoldResult ofr) { return std::nullopt; } +ValueBoundsConstraintSet::Variable::Variable(OpFoldResult ofr) + : Variable(ofr, std::nullopt) {} + +ValueBoundsConstraintSet::Variable::Variable(Value indexValue) + : Variable(static_cast(indexValue)) {} + +ValueBoundsConstraintSet::Variable::Variable(Value shapedValue, int64_t dim) + : Variable(static_cast(shapedValue), std::optional(dim)) {} + +ValueBoundsConstraintSet::Variable::Variable(OpFoldResult ofr, + std::optional dim) { + Builder b(ofr.getContext()); + if (auto constInt = ::getConstantIntValue(ofr)) { + assert(!dim && "expected no dim for index-typed values"); + map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/0, + b.getAffineConstantExpr(*constInt)); + return; + } + Value value = cast(ofr); +#ifndef NDEBUG + if (dim) { + assert(isa(value.getType()) && "expected shaped type"); + } else { + assert(value.getType().isIndex() && "expected index type"); + } +#endif // NDEBUG + map = AffineMap::get(/*dimCount=*/0, /*symbolCount=*/1, + b.getAffineSymbolExpr(0)); + mapOperands.emplace_back(value, dim); +} + +ValueBoundsConstraintSet::Variable::Variable(AffineMap map, + ArrayRef mapOperands) { + assert(map.getNumResults() == 1 && "expected single result"); + + // Turn all dims into symbols. 
+ Builder b(map.getContext()); + SmallVector dimReplacements, symReplacements; + for (int64_t i = 0, e = map.getNumDims(); i < e; ++i) + dimReplacements.push_back(b.getAffineSymbolExpr(i)); + for (int64_t i = 0, e = map.getNumSymbols(); i < e; ++i) + symReplacements.push_back(b.getAffineSymbolExpr(i + map.getNumDims())); + AffineMap tmpMap = map.replaceDimsAndSymbols( + dimReplacements, symReplacements, /*numResultDims=*/0, + /*numResultSyms=*/map.getNumSymbols() + map.getNumDims()); + + // Inline operands. + DenseMap replacements; + for (auto [index, var] : llvm::enumerate(mapOperands)) { + assert(var.map.getNumResults() == 1 && "expected single result"); + assert(var.map.getNumDims() == 0 && "expected only symbols"); + SmallVector symReplacements; + for (auto valueDim : var.mapOperands) { + auto it = llvm::find(this->mapOperands, valueDim); + if (it != this->mapOperands.end()) { + // There is already a symbol for this operand. + symReplacements.push_back(b.getAffineSymbolExpr( + std::distance(this->mapOperands.begin(), it))); + } else { + // This is a new operand: add a new symbol. + symReplacements.push_back( + b.getAffineSymbolExpr(this->mapOperands.size())); + this->mapOperands.push_back(valueDim); + } + } + replacements[b.getAffineSymbolExpr(index)] = + var.map.getResult(0).replaceSymbols(symReplacements); + } + this->map = tmpMap.replace(replacements, /*numResultDims=*/0, + /*numResultSyms=*/this->mapOperands.size()); +} + +ValueBoundsConstraintSet::Variable::Variable(AffineMap map, + ArrayRef mapOperands) + : Variable(map, llvm::map_to_vector(mapOperands, + [](Value v) { return Variable(v); })) {} + ValueBoundsConstraintSet::ValueBoundsConstraintSet( MLIRContext *ctx, StopConditionFn stopCondition) : builder(ctx), stopCondition(stopCondition) { @@ -176,6 +259,11 @@ int64_t ValueBoundsConstraintSet::insert(Value value, assert(!valueDimToPosition.contains(valueDim) && "already mapped"); int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); + LLVM_DEBUG(llvm::dbgs() << "Inserting constraint set column " << pos + << " for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() + << ")\n"); positionToValueDim.insert(positionToValueDim.begin() + pos, valueDim); // Update reverse mapping. for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -194,6 +282,8 @@ int64_t ValueBoundsConstraintSet::insert(Value value, int64_t ValueBoundsConstraintSet::insert(bool isSymbol) { int64_t pos = isSymbol ? cstr.appendVar(VarKind::Symbol) : cstr.appendVar(VarKind::SetDim); + LLVM_DEBUG(llvm::dbgs() << "Inserting anonymous constraint set column " << pos + << "\n"); positionToValueDim.insert(positionToValueDim.begin() + pos, std::nullopt); // Update reverse mapping. 
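+  // For illustration, assuming this file's DEBUG_TYPE is
+  // "value-bounds-op-interface": the LLVM_DEBUG traces added above can be
+  // enabled in an assertions-enabled build with, e.g.:
+  //
+  //   mlir-opt input.mlir -debug-only=value-bounds-op-interface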
for (int64_t i = pos, e = positionToValueDim.size(); i < e; ++i) @@ -224,6 +314,10 @@ int64_t ValueBoundsConstraintSet::insert(AffineMap map, ValueDimList operands, return pos; } +int64_t ValueBoundsConstraintSet::insert(const Variable &var, bool isSymbol) { + return insert(var.map, var.mapOperands, isSymbol); +} + int64_t ValueBoundsConstraintSet::getPos(Value value, std::optional dim) const { #ifndef NDEBUG @@ -232,7 +326,10 @@ int64_t ValueBoundsConstraintSet::getPos(Value value, cast(value).getOwner()->isEntryBlock()) && "unstructured control flow is not supported"); #endif // NDEBUG - + LLVM_DEBUG(llvm::dbgs() << "Getting pos for: " << value + << " (dim: " << dim.value_or(kIndexValue) + << ", owner: " << getOwnerOfValue(value)->getName() + << ")\n"); auto it = valueDimToPosition.find(std::make_pair(value, dim.value_or(kIndexValue))); assert(it != valueDimToPosition.end() && "expected mapped entry"); @@ -253,12 +350,6 @@ bool ValueBoundsConstraintSet::isMapped(Value value, return it != valueDimToPosition.end(); } -static Operation *getOwnerOfValue(Value value) { - if (auto bbArg = dyn_cast(value)) - return bbArg.getOwner()->getParentOp(); - return value.getDefiningOp(); -} - void ValueBoundsConstraintSet::processWorklist() { LLVM_DEBUG(llvm::dbgs() << "Processing value bounds worklist...\n"); while (!worklist.empty()) { @@ -346,41 +437,47 @@ void ValueBoundsConstraintSet::projectOut( } } +void ValueBoundsConstraintSet::projectOutAnonymous( + std::optional except) { + int64_t nextPos = 0; + while (nextPos < static_cast(positionToValueDim.size())) { + if (positionToValueDim[nextPos].has_value() || except == nextPos) { + ++nextPos; + } else { + projectOut(nextPos); + // The column was projected out so another column is now at that position. + // Do not increase the counter. + } + } +} + LogicalResult ValueBoundsConstraintSet::computeBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, StopConditionFn stopCondition, - bool closedUB) { -#ifndef NDEBUG - assertValidValueDim(value, dim); -#endif // NDEBUG - + const Variable &var, StopConditionFn stopCondition, bool closedUB) { + MLIRContext *ctx = var.getContext(); int64_t ubAdjustment = closedUB ? 0 : 1; - Builder b(value.getContext()); + Builder b(ctx); mapOperands.clear(); // Process the backward slice of `value` (i.e., reverse use-def chain) until // `stopCondition` is met. - ValueDim valueDim = std::make_pair(value, dim.value_or(kIndexValue)); - ValueBoundsConstraintSet cstr(value.getContext(), stopCondition); - assert(!stopCondition(value, dim, cstr) && - "stop condition should not be satisfied for starting point"); - int64_t pos = cstr.insert(value, dim, /*isSymbol=*/false); + ValueBoundsConstraintSet cstr(ctx, stopCondition); + int64_t pos = cstr.insert(var, /*isSymbol=*/false); + assert(pos == 0 && "expected first column"); cstr.processWorklist(); // Project out all variables (apart from `valueDim`) that do not match the // stop condition. cstr.projectOut([&](ValueDim p) { - // Do not project out `valueDim`. - if (valueDim == p) - return false; auto maybeDim = p.second == kIndexValue ? std::nullopt : std::make_optional(p.second); return !stopCondition(p.first, maybeDim, cstr); }); + cstr.projectOutAnonymous(/*except=*/pos); // Compute lower and upper bounds for `valueDim`. 
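+  // For illustration, a minimal sketch of driving this entry point through
+  // the new Variable-based API (`iv` is a hypothetical index-typed Value;
+  // `stopCondition` as documented above):
+  //
+  //   AffineMap ubMap;
+  //   ValueDimList ubOperands;
+  //   if (succeeded(ValueBoundsConstraintSet::computeBound(
+  //           ubMap, ubOperands, presburger::BoundType::UB, Variable(iv),
+  //           stopCondition, /*closedUB=*/true))) {
+  //     // `ubMap`, evaluated over `ubOperands`, is a closed upper bound.
+  //   }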
SmallVector lb(1), ub(1); - cstr.cstr.getSliceBounds(pos, 1, value.getContext(), &lb, &ub, + cstr.cstr.getSliceBounds(pos, 1, ctx, &lb, &ub, /*closedUB=*/true); // Note: There are TODOs in the implementation of `getSliceBounds`. In such a @@ -477,10 +574,9 @@ LogicalResult ValueBoundsConstraintSet::computeBound( LogicalResult ValueBoundsConstraintSet::computeDependentBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, ValueDimList dependencies, - bool closedUB) { + const Variable &var, ValueDimList dependencies, bool closedUB) { return computeBound( - resultMap, mapOperands, type, value, dim, + resultMap, mapOperands, type, var, [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return llvm::is_contained(dependencies, std::make_pair(v, d)); }, @@ -489,8 +585,7 @@ LogicalResult ValueBoundsConstraintSet::computeDependentBound( LogicalResult ValueBoundsConstraintSet::computeIndependentBound( AffineMap &resultMap, ValueDimList &mapOperands, presburger::BoundType type, - Value value, std::optional dim, ValueRange independencies, - bool closedUB) { + const Variable &var, ValueRange independencies, bool closedUB) { // Return "true" if the given value is independent of all values in // `independencies`. I.e., neither the value itself nor any value in the // backward slice (reverse use-def chain) is contained in `independencies`. @@ -516,7 +611,7 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( // Reify bounds in terms of any independent values. return computeBound( - resultMap, mapOperands, type, value, dim, + resultMap, mapOperands, type, var, [&](Value v, std::optional d, ValueBoundsConstraintSet &cstr) { return isIndependent(v); }, @@ -524,35 +619,8 @@ LogicalResult ValueBoundsConstraintSet::computeIndependentBound( } FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, Value value, std::optional dim, - StopConditionFn stopCondition, bool closedUB) { -#ifndef NDEBUG - assertValidValueDim(value, dim); -#endif // NDEBUG - - AffineMap map = - AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, - Builder(value.getContext()).getAffineDimExpr(0)); - return computeConstantBound(type, map, {{value, dim}}, stopCondition, - closedUB); -} - -FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, AffineMap map, ArrayRef operands, + presburger::BoundType type, const Variable &var, StopConditionFn stopCondition, bool closedUB) { - ValueDimList valueDims; - for (Value v : operands) { - assert(v.getType().isIndex() && "expected index type"); - valueDims.emplace_back(v, std::nullopt); - } - return computeConstantBound(type, map, valueDims, stopCondition, closedUB); -} - -FailureOr ValueBoundsConstraintSet::computeConstantBound( - presburger::BoundType type, AffineMap map, ValueDimList operands, - StopConditionFn stopCondition, bool closedUB) { - assert(map.getNumResults() == 1 && "expected affine map with one result"); - // Default stop condition if none was specified: Keep adding constraints until // a bound could be computed. int64_t pos = 0; @@ -562,8 +630,8 @@ FailureOr ValueBoundsConstraintSet::computeConstantBound( }; ValueBoundsConstraintSet cstr( - map.getContext(), stopCondition ? stopCondition : defaultStopCondition); - pos = cstr.populateConstraints(map, operands); + var.getContext(), stopCondition ? 
stopCondition : defaultStopCondition); + pos = cstr.populateConstraints(var.map, var.mapOperands); assert(pos == 0 && "expected `map` is the first column"); // Compute constant bound for `valueDim`. @@ -608,22 +676,13 @@ ValueBoundsConstraintSet::computeConstantDelta(Value value1, Value value2, Builder b(value1.getContext()); AffineMap map = AffineMap::get(/*dimCount=*/2, /*symbolCount=*/0, b.getAffineDimExpr(0) - b.getAffineDimExpr(1)); - return computeConstantBound(presburger::BoundType::EQ, map, - {{value1, dim1}, {value2, dim2}}); + return computeConstantBound(presburger::BoundType::EQ, + Variable(map, {{value1, dim1}, {value2, dim2}})); } -bool ValueBoundsConstraintSet::compareValueDims(OpFoldResult lhs, - std::optional lhsDim, - ComparisonOperator cmp, - OpFoldResult rhs, - std::optional rhsDim) { -#ifndef NDEBUG - if (auto lhsVal = dyn_cast(lhs)) - assertValidValueDim(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - assertValidValueDim(rhsVal, rhsDim); -#endif // NDEBUG - +bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, + ComparisonOperator cmp, + int64_t rhsPos) { // This function returns "true" if "lhs CMP rhs" is proven to hold. // // Example for ComparisonOperator::LE and index-typed values: We would like to @@ -640,50 +699,6 @@ bool ValueBoundsConstraintSet::compareValueDims(OpFoldResult lhs, return false; } - // EQ can be expressed as LE and GE. - if (cmp == EQ) - return compareValueDims(lhs, lhsDim, ComparisonOperator::LE, rhs, rhsDim) && - compareValueDims(lhs, lhsDim, ComparisonOperator::GE, rhs, rhsDim); - - // Construct inequality. For the above example: lhs > rhs. - // `IntegerRelation` inequalities are expressed in the "flattened" form and - // with ">= 0". I.e., lhs - rhs - 1 >= 0. - SmallVector eq(cstr.getNumCols(), 0); - auto addToEq = [&](OpFoldResult ofr, std::optional dim, - int64_t factor) { - if (auto constVal = ::getConstantIntValue(ofr)) { - eq[cstr.getNumCols() - 1] += *constVal * factor; - } else { - eq[getPos(cast(ofr), dim)] += factor; - } - }; - if (cmp == LT || cmp == LE) { - addToEq(lhs, lhsDim, 1); - addToEq(rhs, rhsDim, -1); - } else if (cmp == GT || cmp == GE) { - addToEq(lhs, lhsDim, -1); - addToEq(rhs, rhsDim, 1); - } else { - llvm_unreachable("unsupported comparison operator"); - } - if (cmp == LE || cmp == GE) - eq[cstr.getNumCols() - 1] -= 1; - - // Add inequality to the constraint set and check if it made the constraint - // set empty. - int64_t ineqPos = cstr.getNumInequalities(); - cstr.addInequality(eq); - bool isEmpty = cstr.isEmpty(); - cstr.removeInequality(ineqPos); - return isEmpty; -} - -bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, - ComparisonOperator cmp, - int64_t rhsPos) { - // This function returns "true" if "lhs CMP rhs" is proven to hold. For - // detailed documentation, see `compareValueDims`. - // EQ can be expressed as LE and GE. 
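+  // For illustration: comparisons here are proven by contradiction. To show
+  //   lhs <= rhs,
+  // the negated relation lhs - rhs - 1 >= 0 (i.e., lhs > rhs) is added to
+  // the constraint set; if the set becomes empty, the negation is
+  // infeasible and the original relation holds. EQ is then the conjunction
+  //   (lhs <= rhs) && (lhs >= rhs).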
if (cmp == EQ) return comparePos(lhsPos, ComparisonOperator::LE, rhsPos) && @@ -712,48 +727,17 @@ bool ValueBoundsConstraintSet::comparePos(int64_t lhsPos, return isEmpty; } -bool ValueBoundsConstraintSet::populateAndCompare( - OpFoldResult lhs, std::optional lhsDim, ComparisonOperator cmp, - OpFoldResult rhs, std::optional rhsDim) { -#ifndef NDEBUG - if (auto lhsVal = dyn_cast(lhs)) - assertValidValueDim(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - assertValidValueDim(rhsVal, rhsDim); -#endif // NDEBUG - - if (auto lhsVal = dyn_cast(lhs)) - populateConstraints(lhsVal, lhsDim); - if (auto rhsVal = dyn_cast(rhs)) - populateConstraints(rhsVal, rhsDim); - - return compareValueDims(lhs, lhsDim, cmp, rhs, rhsDim); +bool ValueBoundsConstraintSet::populateAndCompare(const Variable &lhs, + ComparisonOperator cmp, + const Variable &rhs) { + int64_t lhsPos = populateConstraints(lhs.map, lhs.mapOperands); + int64_t rhsPos = populateConstraints(rhs.map, rhs.mapOperands); + return comparePos(lhsPos, cmp, rhsPos); } -bool ValueBoundsConstraintSet::compare(OpFoldResult lhs, - std::optional lhsDim, - ComparisonOperator cmp, OpFoldResult rhs, - std::optional rhsDim) { - auto stopCondition = [&](Value v, std::optional dim, - ValueBoundsConstraintSet &cstr) { - // Keep processing as long as lhs/rhs are not mapped. - if (auto lhsVal = dyn_cast(lhs)) - if (!cstr.isMapped(lhsVal, dim)) - return false; - if (auto rhsVal = dyn_cast(rhs)) - if (!cstr.isMapped(rhsVal, dim)) - return false; - // Keep processing as long as the relation cannot be proven. - return cstr.compareValueDims(lhs, lhsDim, cmp, rhs, rhsDim); - }; - - ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition); - return cstr.populateAndCompare(lhs, lhsDim, cmp, rhs, rhsDim); -} - -bool ValueBoundsConstraintSet::compare(AffineMap lhs, ValueDimList lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ValueDimList rhsOperands) { +bool ValueBoundsConstraintSet::compare(const Variable &lhs, + ComparisonOperator cmp, + const Variable &rhs) { int64_t lhsPos = -1, rhsPos = -1; auto stopCondition = [&](Value v, std::optional dim, ValueBoundsConstraintSet &cstr) { @@ -765,39 +749,17 @@ bool ValueBoundsConstraintSet::compare(AffineMap lhs, ValueDimList lhsOperands, return cstr.comparePos(lhsPos, cmp, rhsPos); }; ValueBoundsConstraintSet cstr(lhs.getContext(), stopCondition); - lhsPos = cstr.insert(lhs, lhsOperands); - rhsPos = cstr.insert(rhs, rhsOperands); - cstr.processWorklist(); + lhsPos = cstr.populateConstraints(lhs.map, lhs.mapOperands); + rhsPos = cstr.populateConstraints(rhs.map, rhs.mapOperands); return cstr.comparePos(lhsPos, cmp, rhsPos); } -bool ValueBoundsConstraintSet::compare(AffineMap lhs, - ArrayRef lhsOperands, - ComparisonOperator cmp, AffineMap rhs, - ArrayRef rhsOperands) { - ValueDimList lhsValueDimOperands = - llvm::map_to_vector(lhsOperands, [](Value v) { - return std::make_pair(v, std::optional()); - }); - ValueDimList rhsValueDimOperands = - llvm::map_to_vector(rhsOperands, [](Value v) { - return std::make_pair(v, std::optional()); - }); - return ValueBoundsConstraintSet::compare(lhs, lhsValueDimOperands, cmp, rhs, - rhsValueDimOperands); -} - -FailureOr -ValueBoundsConstraintSet::areEqual(OpFoldResult value1, OpFoldResult value2, - std::optional dim1, - std::optional dim2) { - if (ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::EQ, - value2, dim2)) +FailureOr ValueBoundsConstraintSet::areEqual(const Variable &var1, + const Variable &var2) { + if (ValueBoundsConstraintSet::compare(var1, 
ComparisonOperator::EQ, var2)) return true; - if (ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::LT, - value2, dim2) || - ValueBoundsConstraintSet::compare(value1, dim1, ComparisonOperator::GT, - value2, dim2)) + if (ValueBoundsConstraintSet::compare(var1, ComparisonOperator::LT, var2) || + ValueBoundsConstraintSet::compare(var1, ComparisonOperator::GT, var2)) return false; return failure(); } @@ -833,7 +795,7 @@ ValueBoundsConstraintSet::areOverlappingSlices(MLIRContext *ctx, AffineMap foldedMap = foldAttributesIntoMap(b, map, ofrOperands, valueOperands); FailureOr constBound = computeConstantBound( - presburger::BoundType::EQ, foldedMap, valueOperands); + presburger::BoundType::EQ, Variable(foldedMap, valueOperands)); foundUnknownBound |= failed(constBound); if (succeeded(constBound) && *constBound <= 0) return false; @@ -850,7 +812,7 @@ ValueBoundsConstraintSet::areOverlappingSlices(MLIRContext *ctx, AffineMap foldedMap = foldAttributesIntoMap(b, map, ofrOperands, valueOperands); FailureOr constBound = computeConstantBound( - presburger::BoundType::EQ, foldedMap, valueOperands); + presburger::BoundType::EQ, Variable(foldedMap, valueOperands)); foundUnknownBound |= failed(constBound); if (succeeded(constBound) && *constBound <= 0) return false; diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 300fc8ba56fc5..e89ff9209b034 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1406,9 +1406,10 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder. static LogicalResult -convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) { - auto loop = cast(opInst); +convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto simdOp = cast(opInst); + auto loopOp = cast(simdOp.getWrappedLoop()); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); @@ -1421,33 +1422,34 @@ convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder, auto bodyGen = [&](llvm::OpenMPIRBuilder::InsertPointTy ip, llvm::Value *iv) { // Make sure further conversions know about the induction variable. moduleTranslation.mapValue( - loop.getRegion().front().getArgument(loopInfos.size()), iv); + loopOp.getRegion().front().getArgument(loopInfos.size()), iv); // Capture the body insertion point for use in nested loops. BodyIP of the // CanonicalLoopInfo always points to the beginning of the entry block of // the body. bodyInsertPoints.push_back(ip); - if (loopInfos.size() != loop.getNumLoops() - 1) + if (loopInfos.size() != loopOp.getNumLoops() - 1) return; // Convert the body of the loop. builder.restoreIP(ip); - convertOmpOpRegions(loop.getRegion(), "omp.simdloop.region", builder, + convertOmpOpRegions(loopOp.getRegion(), "omp.simd.region", builder, moduleTranslation, bodyGenStatus); }; // Delegate actual loop construction to the OpenMP IRBuilder. - // TODO: this currently assumes SimdLoop is semantically similar to SCF loop, - // i.e. it has a positive step, uses signed integer semantics. Reconsider - // this code when SimdLoop clearly supports more cases. + // TODO: this currently assumes omp.loop_nest is semantically similar to SCF + // loop, i.e. 
it has a positive step, uses signed integer semantics. + // Reconsider this code when the nested loop operation clearly supports more + // cases. llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); - for (unsigned i = 0, e = loop.getNumLoops(); i < e; ++i) { + for (unsigned i = 0, e = loopOp.getNumLoops(); i < e; ++i) { llvm::Value *lowerBound = - moduleTranslation.lookupValue(loop.getLowerBound()[i]); + moduleTranslation.lookupValue(loopOp.getLowerBound()[i]); llvm::Value *upperBound = - moduleTranslation.lookupValue(loop.getUpperBound()[i]); - llvm::Value *step = moduleTranslation.lookupValue(loop.getStep()[i]); + moduleTranslation.lookupValue(loopOp.getUpperBound()[i]); + llvm::Value *step = moduleTranslation.lookupValue(loopOp.getStep()[i]); // Make sure loop trip count are emitted in the preheader of the outermost // loop at the latest so that they are all available for the new collapsed @@ -1473,18 +1475,18 @@ convertOmpSimdLoop(Operation &opInst, llvm::IRBuilderBase &builder, ompBuilder->collapseLoops(ompLoc.DL, loopInfos, {}); llvm::ConstantInt *simdlen = nullptr; - if (std::optional simdlenVar = loop.getSimdlen()) + if (std::optional simdlenVar = simdOp.getSimdlen()) simdlen = builder.getInt64(simdlenVar.value()); llvm::ConstantInt *safelen = nullptr; - if (std::optional safelenVar = loop.getSafelen()) + if (std::optional safelenVar = simdOp.getSafelen()) safelen = builder.getInt64(safelenVar.value()); llvm::MapVector alignedVars; ompBuilder->applySimd( loopInfo, alignedVars, - loop.getIfExpr() ? moduleTranslation.lookupValue(loop.getIfExpr()) - : nullptr, + simdOp.getIfExpr() ? moduleTranslation.lookupValue(simdOp.getIfExpr()) + : nullptr, llvm::omp::OrderKind::OMP_ORDER_unknown, simdlen, safelen); builder.restoreIP(afterIP); @@ -3198,8 +3200,8 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::WsloopOp) { return convertOmpWsloop(*op, builder, moduleTranslation); }) - .Case([&](omp::SimdLoopOp) { - return convertOmpSimdLoop(*op, builder, moduleTranslation); + .Case([&](omp::SimdOp) { + return convertOmpSimd(*op, builder, moduleTranslation); }) .Case([&](omp::AtomicReadOp) { return convertOmpAtomicRead(*op, builder, moduleTranslation); @@ -3421,7 +3423,6 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation( return convertTargetOpsInNest(op, builder, moduleTranslation); } } - return convertHostOrTargetOperation(op, builder, moduleTranslation); } diff --git a/mlir/python/CMakeLists.txt b/mlir/python/CMakeLists.txt index c27ee688a0408..0a2dc0754c09d 100644 --- a/mlir/python/CMakeLists.txt +++ b/mlir/python/CMakeLists.txt @@ -524,6 +524,19 @@ declare_mlir_python_extension(MLIRPythonExtension.Dialects.Quant.Pybind MLIRCAPIQuant ) +declare_mlir_python_extension(MLIRPythonExtension.Dialects.NVGPU.Pybind + MODULE_NAME _mlirDialectsNvgpu + ADD_TO_PARENT MLIRPythonSources.Dialects.nvgpu + ROOT_DIR "${PYTHON_SOURCE_DIR}" + SOURCES + DialectNVGPU.cpp + PRIVATE_LINK_LIBS + LLVMSupport + EMBED_CAPI_LINK_LIBS + MLIRCAPIIR + MLIRCAPINVGPU +) + declare_mlir_python_extension(MLIRPythonExtension.Dialects.PDL.Pybind MODULE_NAME _mlirDialectsPDL ADD_TO_PARENT MLIRPythonSources.Dialects.pdl diff --git a/mlir/python/mlir/dialects/nvgpu.py b/mlir/python/mlir/dialects/nvgpu.py index 2f6993b768ca5..e19bf610ea33a 100644 --- a/mlir/python/mlir/dialects/nvgpu.py +++ b/mlir/python/mlir/dialects/nvgpu.py @@ -4,3 +4,4 @@ from ._nvgpu_ops_gen import * from ._nvgpu_enum_gen import * +from .._mlir_libs._mlirDialectsNvgpu import * 
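For readers following the OpenMP translation hunk above: convertHostOrTargetOperation selects the handler with an llvm::TypeSwitch over the concrete op type, so the omp.simdloop -> omp.simd rename only touches one .Case entry. A minimal sketch of that dispatch pattern (the .Default handler shown here is illustrative and assumed, not quoted from the patch):

    // Sketch: dispatching a conversion on the op's concrete type.
    return llvm::TypeSwitch<Operation *, LogicalResult>(op)
        .Case([&](omp::SimdOp) {
          return convertOmpSimd(*op, builder, moduleTranslation);
        })
        .Default([&](Operation *inst) -> LogicalResult {
          return inst->emitError("unsupported OpenMP operation: ")
                 << inst->getName();
        });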
diff --git a/mlir/python/mlir/dialects/transform/interpreter/__init__.py b/mlir/python/mlir/dialects/transform/interpreter/__init__.py index 6145b99224eb5..34cdc43cb617f 100644 --- a/mlir/python/mlir/dialects/transform/interpreter/__init__.py +++ b/mlir/python/mlir/dialects/transform/interpreter/__init__.py @@ -5,7 +5,6 @@ from ....ir import Operation from ...._mlir_libs import _mlirTransformInterpreter as _cextTransformInterpreter - TransformOptions = _cextTransformInterpreter.TransformOptions @@ -31,3 +30,12 @@ def apply_named_sequence( _cextTransformInterpreter.apply_named_sequence(*args) else: _cextTransformInterpreter(*args, transform_options) + + +def copy_symbols_and_merge_into(target, other): + """Copies symbols from other into target, renaming private symbols to avoid + duplicates. Raises an error if copying would lead to duplicate public + symbols.""" + _cextTransformInterpreter.copy_symbols_and_merge_into( + _unpack_operation(target), _unpack_operation(other) + ) diff --git a/mlir/test/CAPI/ir.c b/mlir/test/CAPI/ir.c index 8e79338c57a22..3d05b2a12dd8e 100644 --- a/mlir/test/CAPI/ir.c +++ b/mlir/test/CAPI/ir.c @@ -2244,9 +2244,22 @@ typedef struct { const char *x; } callBackData; -void walkCallBack(MlirOperation op, void *rootOpVoid) { +MlirWalkResult walkCallBack(MlirOperation op, void *rootOpVoid) { fprintf(stderr, "%s: %s\n", ((callBackData *)(rootOpVoid))->x, mlirIdentifierStr(mlirOperationGetName(op)).data); + return MlirWalkResultAdvance; +} + +MlirWalkResult walkCallBackTestWalkResult(MlirOperation op, void *rootOpVoid) { + fprintf(stderr, "%s: %s\n", ((callBackData *)(rootOpVoid))->x, + mlirIdentifierStr(mlirOperationGetName(op)).data); + if (strcmp(mlirIdentifierStr(mlirOperationGetName(op)).data, "func.func") == + 0) + return MlirWalkResultSkip; + if (strcmp(mlirIdentifierStr(mlirOperationGetName(op)).data, "arith.addi") == + 0) + return MlirWalkResultInterrupt; + return MlirWalkResultAdvance; } int testOperationWalk(MlirContext ctx) { @@ -2259,6 +2272,9 @@ int testOperationWalk(MlirContext ctx) { " arith.addi %1, %1: i32\n" " return\n" " }\n" + " func.func @bar() {\n" + " return\n" + " }\n" "}"; MlirModule module = mlirModuleCreateParse(ctx, mlirStringRefCreateFromCString(moduleString)); @@ -2266,22 +2282,42 @@ int testOperationWalk(MlirContext ctx) { callBackData data; data.x = "i love you"; - // CHECK: i love you: arith.constant - // CHECK: i love you: arith.addi - // CHECK: i love you: func.return - // CHECK: i love you: func.func - // CHECK: i love you: builtin.module + // CHECK-NEXT: i love you: arith.constant + // CHECK-NEXT: i love you: arith.addi + // CHECK-NEXT: i love you: func.return + // CHECK-NEXT: i love you: func.func + // CHECK-NEXT: i love you: func.return + // CHECK-NEXT: i love you: func.func + // CHECK-NEXT: i love you: builtin.module mlirOperationWalk(mlirModuleGetOperation(module), walkCallBack, (void *)(&data), MlirWalkPostOrder); data.x = "i don't love you"; - // CHECK: i don't love you: builtin.module - // CHECK: i don't love you: func.func - // CHECK: i don't love you: arith.constant - // CHECK: i don't love you: arith.addi - // CHECK: i don't love you: func.return + // CHECK-NEXT: i don't love you: builtin.module + // CHECK-NEXT: i don't love you: func.func + // CHECK-NEXT: i don't love you: arith.constant + // CHECK-NEXT: i don't love you: arith.addi + // CHECK-NEXT: i don't love you: func.return + // CHECK-NEXT: i don't love you: func.func + // CHECK-NEXT: i don't love you: func.return mlirOperationWalk(mlirModuleGetOperation(module), 
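+  /* For illustration, the walk-result contract exercised by
+     walkCallBackTestWalkResult above (assuming the MlirWalkResult values
+     introduced by this change):
+       - MlirWalkResultAdvance:   continue the walk normally;
+       - MlirWalkResultSkip:      do not descend into the op's regions
+                                  (only observable in pre-order walks);
+       - MlirWalkResultInterrupt: abort the entire walk immediately.
+     Returning Skip at "func.func" and Interrupt at "arith.addi" produces
+     exactly the CHECK-NEXT sequences in the tests below. */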
walkCallBack, (void *)(&data), MlirWalkPreOrder); + + data.x = "interrupt"; + // Interrupted at `arith.addi` + // CHECK-NEXT: interrupt: arith.constant + // CHECK-NEXT: interrupt: arith.addi + mlirOperationWalk(mlirModuleGetOperation(module), walkCallBackTestWalkResult, + (void *)(&data), MlirWalkPostOrder); + + data.x = "skip"; + // Skip at `func.func` + // CHECK-NEXT: skip: builtin.module + // CHECK-NEXT: skip: func.func + // CHECK-NEXT: skip: func.func + mlirOperationWalk(mlirModuleGetOperation(module), walkCallBackTestWalkResult, + (void *)(&data), MlirWalkPreOrder); + mlirModuleDestroy(module); return 0; } diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir index fa1d564d6ad35..827ae940165c7 100644 --- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir +++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir @@ -2187,3 +2187,16 @@ func.func @complex_tanh_nnan_ninf(%arg: complex) -> complex { // CHECK-COUNT-1: arith.select // CHECK-NOT: arith.select + +// ----- + +// CHECK-LABEL: func.func @complex_angle_with_fmf +// CHECK-SAME: %[[ARG:.*]]: complex +func.func @complex_angle_with_fmf(%arg: complex) -> f32 { + %angle = complex.angle %arg fastmath : complex + return %angle : f32 +} +// CHECK: %[[REAL:.*]] = complex.re %[[ARG]] : complex +// CHECK: %[[IMAG:.*]] = complex.im %[[ARG]] : complex +// CHECK: %[[RESULT:.*]] = math.atan2 %[[IMAG]], %[[REAL]] fastmath : f32 +// CHECK: return %[[RESULT]] : f32 \ No newline at end of file diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index dc5d6969ca789..9f45d139b81f2 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -145,9 +145,10 @@ func.func @threadprivate(%a: !llvm.ptr) -> () { // ----- -// CHECK: llvm.func @simdloop_block_arg(%[[LOWER:.*]]: i32, %[[UPPER:.*]]: i32, %[[ITER:.*]]: i64) { -// CHECK: omp.simdloop for (%[[ARG_0:.*]]) : i32 = -// CHECK-SAME: (%[[LOWER]]) to (%[[UPPER]]) inclusive step (%[[LOWER]]) { +// CHECK: llvm.func @loop_nest_block_arg(%[[LOWER:.*]]: i32, %[[UPPER:.*]]: i32, %[[ITER:.*]]: i64) { +// CHECK: omp.simd { +// CHECK-NEXT: omp.loop_nest (%[[ARG_0:.*]]) : i32 = (%[[LOWER]]) +// CHECK-SAME: to (%[[UPPER]]) inclusive step (%[[LOWER]]) { // CHECK: llvm.br ^[[BB1:.*]](%[[ITER]] : i64) // CHECK: ^[[BB1]](%[[VAL_0:.*]]: i64): // CHECK: %[[VAL_1:.*]] = llvm.icmp "slt" %[[VAL_0]], %[[ITER]] : i64 @@ -157,17 +158,19 @@ func.func @threadprivate(%a: !llvm.ptr) -> () { // CHECK: llvm.br ^[[BB1]](%[[VAL_2]] : i64) // CHECK: ^[[BB3]]: // CHECK: omp.yield -func.func @simdloop_block_arg(%val : i32, %ub : i32, %i : index) { - omp.simdloop for (%arg0) : i32 = (%val) to (%ub) inclusive step (%val) { - cf.br ^bb1(%i : index) - ^bb1(%0: index): - %1 = arith.cmpi slt, %0, %i : index - cf.cond_br %1, ^bb2, ^bb3 - ^bb2: - %2 = arith.addi %0, %i : index - cf.br ^bb1(%2 : index) - ^bb3: - omp.yield +func.func @loop_nest_block_arg(%val : i32, %ub : i32, %i : index) { + omp.simd { + omp.loop_nest (%arg0) : i32 = (%val) to (%ub) inclusive step (%val) { + cf.br ^bb1(%i : index) + ^bb1(%0: index): + %1 = arith.cmpi slt, %0, %i : index + cf.cond_br %1, ^bb2, ^bb3 + ^bb2: + %2 = arith.addi %0, %i : index + cf.br ^bb1(%2 : index) + ^bb3: + omp.yield + } } return } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir 
b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir index e64903671e599..b4049000c50dc 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -1,5 +1,6 @@ // RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named))" %s -verify-diagnostics -o -| FileCheck %s // RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named{prefer-conv2d-kernel-layout-hwcf=true}))" %s -verify-diagnostics -o -| FileCheck --check-prefix="HWCF" %s +// RUN: mlir-opt --split-input-file -pass-pipeline="builtin.module(func.func(tosa-to-linalg-named,cse))" %s -verify-diagnostics -o -| FileCheck --check-prefix="CHECK-CSE" %s // CHECK-LABEL: @matmul func.func @matmul(%arg0: tensor<1x5x3xf32>, %arg1: tensor<1x3x6xf32>) -> (tensor<1x5x6xf32>) { @@ -215,6 +216,59 @@ func.func @max_pool_i32(%arg0: tensor<1x6x34x62xi32>) -> () { return } +// CHECK-CSE-LABEL: @max_pool_all_dynamic +func.func @max_pool_all_dynamic(%arg0: tensor) -> tensor { + // Batch size + // CHECK-CSE: %[[C0:.+]] = arith.constant 0 : index + // CHECK-CSE: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]] : tensor + + // Compute output height + // CHECK-CSE: %[[C1:.+]] = arith.constant 1 : index + // CHECK-CSE: %[[IH:.+]] = tensor.dim %arg0, %[[C1]] : tensor + // CHECK-CSE: %[[C2:.+]] = arith.constant 2 : index + // CHECK-CSE: %[[PADDED_BEFORE:.+]] = arith.addi %[[IH]], %[[C0]] : index + // CHECK-CSE: %[[PADDED_AFTER:.+]] = arith.addi %[[PADDED_BEFORE]], %[[C0]] : index + // CHECK-CSE: %[[SUB_ONE:.+]] = arith.subi %[[C2]], %[[C1]] : index + // CHECK-CSE: %[[DILATED:.+]] = arith.muli %[[C1]], %[[SUB_ONE]] : index + // CHECK-CSE: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[C1]] : index + // CHECK-CSE: %[[SUBTRACT:.+]] = arith.subi %[[PADDED_AFTER]], %[[ADD_ONE]] : index + // CHECK-CSE: %[[DIVIDE:.+]] = arith.divui %[[SUBTRACT]], %[[C1]] : index + // CHECK-CSE: %[[HEIGHT:.+]] = arith.addi %[[DIVIDE]], %[[C1]] : index + + // Compute output width + // CHECK-CSE: %[[IW:.+]] = tensor.dim %arg0, %[[C2]] : tensor + // CHECK-CSE: %[[C5:.+]] = arith.constant 5 : index + // CHECK-CSE: %[[PADDED_BEFORE:.+]] = arith.addi %[[IW]], %[[C2]] : index + // CHECK-CSE: %[[PADDED_AFTER:.+]] = arith.addi %[[PADDED_BEFORE]], %[[C2]] : index + // CHECK-CSE: %[[SUB_ONE:.+]] = arith.subi %[[C5]], %[[C1]] : index + // CHECK-CSE: %[[DILATED:.+]] = arith.muli %[[C1]], %[[SUB_ONE]] : index + // CHECK-CSE: %[[ADD_ONE:.+]] = arith.addi %[[DILATED]], %[[C1]] : index + // CHECK-CSE: %[[SUBTRACT:.+]] = arith.subi %[[PADDED_AFTER]], %[[ADD_ONE]] : index + // CHECK-CSE: %[[DIVIDE:.+]] = arith.divui %[[SUBTRACT]], %[[C1]] : index + // CHECK-CSE: %[[WIDTH:.+]] = arith.addi %14, %[[C1]] : index + + // Channel size + // CHECK-CSE: %[[C3:.+]] = arith.constant 3 : index + // CHECK-CSE: %[[CHANNEL:.+]] = tensor.dim %arg0, %[[C3]] : tensor + + // Pad the input + // CHECK-CSE: %[[FLOAT_MIN:.+]] = arith.constant -3.40282347E+38 : f32 + // CHECK-CSE: %[[PADDED:.+]] = tensor.pad %arg0 low[0, 0, 2, 0] high[0, 0, 2, 0] { + // CHECK-CSE: tensor.yield %[[FLOAT_MIN]] : f32 + + // Allocate the output and fill with minimum value + // CHECK-CSE: %[[INIT:.+]] = tensor.empty(%[[BATCH]], %[[HEIGHT]], %[[WIDTH]], %[[CHANNEL]]) : tensor + // CHECK-CSE: %[[FILL:.+]] = linalg.fill ins(%[[FLOAT_MIN]] : f32) outs(%[[INIT]] : tensor) -> tensor + // CHECK-CSE: %[[FAKE_WINDOW:.+]] = tensor.empty() : tensor<2x5xf32> + + // Compute max pool + // CHECK-CSE: %[[OUT:.+]] 
= linalg.pooling_nhwc_max {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%[[PADDED]], %[[FAKE_WINDOW]] : tensor, tensor<2x5xf32>) outs(%[[FILL]] : tensor) -> tensor + // CHECK-CSE: return %[[OUT]] + + %0 = tosa.max_pool2d %arg0 {kernel = array, pad = array, stride = array} : (tensor) -> tensor + return %0 : tensor +} + // ----- // CHECK-LABEL: @avg_pool_f32 diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir index 468e92e2a2661..d42d0a46692d4 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-resize.mlir @@ -304,42 +304,36 @@ func.func @resize_nearest_fp32(%input: tensor<1x50x48x1xf32>) -> () { // CHECK-DAG: %[[XMAX:.*]] = arith.constant 47 // CHECK: %[[Y:.+]] = arith.index_cast %[[IDX1]] // CHECK: %[[X:.+]] = arith.index_cast %[[IDX2]] - // CHECK-DAG: %[[ISCALE_Y_N:.*]] = arith.constant 64 - // CHECK-DAG: %[[ISCALE_Y_D:.*]] = arith.constant 2 - // CHECK-DAG: %[[ISCALE_X_N:.*]] = arith.constant 64 - // CHECK-DAG: %[[ISCALE_X_D:.*]] = arith.constant 2 - // CHECK-DAG: %[[IOFFSET_Y:.*]] = arith.constant -31 - // CHECK-DAG: %[[IOFFSET_X:.*]] = arith.constant -31 - // CHECK-DAG: %[[IBORDER_Y:.*]] = arith.constant 31 - // CHECK-DAG: %[[IBORDER_X:.*]] = arith.constant 31 - - // CHECK: %[[Y0:.+]] = arith.uitofp %[[Y]] - // CHECK: %[[SCALE_Y_N:.*]] = arith.uitofp %[[ISCALE_Y_N]] - // CHECK: %[[SCALE_Y_D:.*]] = arith.uitofp %[[ISCALE_Y_D]] - // CHECK: %[[OFFSET_Y:.*]] = arith.sitofp %[[IOFFSET_Y]] - // CHECK: %[[VAL_29:.*]] = arith.mulf %[[Y0]], %[[SCALE_Y_D]] - // CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_29]], %[[OFFSET_Y]] - // CHECK: %[[VAL_33:.*]] = arith.divf %[[VAL_31]], %[[SCALE_Y_N]] - // CHECK: %[[VAL_35:.*]] = math.floor %[[VAL_33]] - // CHECK: %[[D_Y:.*]] = arith.subf %[[VAL_33]], %[[VAL_35]] - // CHECK: %[[VAL_39:.*]] = arith.fptosi %[[VAL_35]] - - // CHECK: %[[X0:.+]] = arith.uitofp %[[X]] - // CHECK: %[[SCALE_X_N:.*]] = arith.uitofp %[[ISCALE_X_N]] - // CHECK: %[[SCALE_X_D:.*]] = arith.uitofp %[[ISCALE_X_D]] - // CHECK: %[[OFFSET_X:.*]] = arith.sitofp %[[IOFFSET_X]] - // CHECK: %[[VAL_30:.*]] = arith.mulf %[[X0]], %[[SCALE_X_D]] - // CHECK: %[[VAL_32:.*]] = arith.addf %[[VAL_30]], %[[OFFSET_X]] - // CHECK: %[[VAL_34:.*]] = arith.divf %[[VAL_32]], %[[SCALE_X_N]] - // CHECK: %[[VAL_36:.*]] = math.floor %[[VAL_34]] - // CHECK: %[[D_X:.*]] = arith.subf %[[VAL_34]], %[[VAL_36]] - // CHECK: %[[VAL_40:.*]] = arith.fptosi %[[VAL_36]] + // CHECK-DAG: %[[SCALE_Y_N:.*]] = arith.constant 64 + // CHECK-DAG: %[[SCALE_Y_D:.*]] = arith.constant 2 + // CHECK-DAG: %[[SCALE_X_N:.*]] = arith.constant 64 + // CHECK-DAG: %[[SCALE_X_D:.*]] = arith.constant 2 + // CHECK-DAG: %[[OFFSET_Y:.*]] = arith.constant -31 + // CHECK-DAG: %[[OFFSET_X:.*]] = arith.constant -31 + // CHECK-DAG: %[[BORDER_Y:.*]] = arith.constant 31 + // CHECK-DAG: %[[BORDER_X:.*]] = arith.constant 31 + + // CHECK: %[[VAL_29:.*]] = arith.muli %[[Y]], %[[SCALE_Y_D]] + // CHECK: %[[Y_TEMP:.*]] = arith.addi %[[VAL_29]], %[[OFFSET_Y]] + // CHECK: %[[IY_TEMP:.*]] = arith.floordivsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY:.*]] = arith.remsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY_FP:.*]] = arith.sitofp %[[RY]] + // CHECK: %[[SCALE_Y_N_FP:.*]] = arith.uitofp %[[SCALE_Y_N]] + // CHECK: %[[D_Y:.*]] = arith.divf %[[RY_FP]], %[[SCALE_Y_N_FP]] + + // CHECK: %[[VAL_30:.*]] = arith.muli %[[X]], %[[SCALE_X_D]] + // CHECK: %[[X_TEMP:.*]] = arith.addi 
%[[VAL_30]], %[[OFFSET_X]] + // CHECK: %[[IX_TEMP:.*]] = arith.floordivsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX:.*]] = arith.remsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX_FP:.*]] = arith.sitofp %[[RX]] + // CHECK: %[[SCALE_X_N_FP:.*]] = arith.uitofp %[[SCALE_X_N]] + // CHECK: %[[D_X:.*]] = arith.divf %[[RX_FP]], %[[SCALE_X_N_FP]] // CHECK-DAG: %[[ONE:.*]] = arith.constant 1 // CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 // CHECK: %[[PRED_Y:.*]] = arith.cmpf oge, %[[D_Y]], %[[HALF]] // CHECK: %[[ROUND_Y:.*]] = arith.select %[[PRED_Y]], %[[ONE]], %[[ZERO]] - // CHECK: %[[VAL_48:.*]] = arith.addi %[[VAL_39]], %[[ROUND_Y]] + // CHECK: %[[VAL_48:.*]] = arith.addi %[[IY_TEMP]], %[[ROUND_Y]] // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_48]] // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[YMAX]], %[[LOWER]] // CHECK: %[[IDY:.*]] = arith.index_cast %[[CLAMPED]] @@ -347,7 +341,7 @@ func.func @resize_nearest_fp32(%input: tensor<1x50x48x1xf32>) -> () { // CHECK-DAG: %[[HALF:.*]] = arith.constant 5.000000e-01 // CHECK: %[[PRED_X:.*]] = arith.cmpf oge, %[[D_X]], %[[HALF]] // CHECK: %[[ROUND_X:.*]] = arith.select %[[PRED_X]], %[[ONE]], %[[ZERO]] - // CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_40]], %[[ROUND_X]] + // CHECK: %[[VAL_49:.*]] = arith.addi %[[IX_TEMP]], %[[ROUND_X]] // CHECK: %[[LOWER:.*]] = arith.maxsi %[[ZERO]], %[[VAL_49]] // CHECK: %[[CLAMPED:.*]] = arith.minsi %[[XMAX]], %[[LOWER]] // CHECK: %[[IDX:.*]] = arith.index_cast %[[CLAMPED]] @@ -374,36 +368,30 @@ func.func @resize_bilinear_fp(%input: tensor<1x23x24x1xf32>) -> () { // CHECK-DAG: %[[X_MAX:.*]] = arith.constant 23 // CHECK: %[[Y:.+]] = arith.index_cast %[[IDX_1]] // CHECK: %[[X:.+]] = arith.index_cast %[[IDX_2]] - // CHECK-DAG: %[[ISCALE_Y_N:.*]] = arith.constant 4 - // CHECK-DAG: %[[ISCALE_Y_D:.*]] = arith.constant 1 - // CHECK-DAG: %[[ISCALE_X_N:.*]] = arith.constant 4 - // CHECK-DAG: %[[ISCALE_X_D:.*]] = arith.constant 1 - // CHECK-DAG: %[[IOFFSET_Y:.*]] = arith.constant 0 - // CHECK-DAG: %[[IOFFSET_X:.*]] = arith.constant 0 - // CHECK-DAG: %[[IBORDER_Y:.*]] = arith.constant 0 - // CHECK-DAG: %[[IBORDER_X:.*]] = arith.constant 0 - - // CHECK: %[[Y0:.+]] = arith.uitofp %[[Y]] - // CHECK: %[[SCALE_Y_N:.*]] = arith.uitofp %[[ISCALE_Y_N]] - // CHECK: %[[SCALE_Y_D:.*]] = arith.uitofp %[[ISCALE_Y_D]] - // CHECK: %[[OFFSET_Y:.*]] = arith.sitofp %[[IOFFSET_Y]] - // CHECK: %[[VAL_29:.*]] = arith.mulf %[[Y0]], %[[SCALE_Y_D]] - // CHECK: %[[VAL_31:.*]] = arith.addf %[[VAL_29]], %[[OFFSET_Y]] - // CHECK: %[[VAL_33:.*]] = arith.divf %[[VAL_31]], %[[SCALE_Y_N]] - // CHECK: %[[VAL_35:.*]] = math.floor %[[VAL_33]] - // CHECK: %[[D_Y:.*]] = arith.subf %[[VAL_33]], %[[VAL_35]] - // CHECK: %[[I_Y:.*]] = arith.fptosi %[[VAL_35]] - - // CHECK: %[[X0:.+]] = arith.uitofp %[[X]] - // CHECK: %[[SCALE_X_N:.*]] = arith.uitofp %[[ISCALE_X_N]] - // CHECK: %[[SCALE_X_D:.*]] = arith.uitofp %[[ISCALE_X_D]] - // CHECK: %[[OFFSET_X:.*]] = arith.sitofp %[[IOFFSET_X]] - // CHECK: %[[VAL_30:.*]] = arith.mulf %[[X0]], %[[SCALE_X_D]] - // CHECK: %[[VAL_32:.*]] = arith.addf %[[VAL_30]], %[[OFFSET_X]] - // CHECK: %[[VAL_34:.*]] = arith.divf %[[VAL_32]], %[[SCALE_X_N]] - // CHECK: %[[VAL_36:.*]] = math.floor %[[VAL_34]] - // CHECK: %[[D_X:.*]] = arith.subf %[[VAL_34]], %[[VAL_36]] - // CHECK: %[[I_X:.*]] = arith.fptosi %[[VAL_36]] + // CHECK-DAG: %[[SCALE_Y_N:.*]] = arith.constant 4 + // CHECK-DAG: %[[SCALE_Y_D:.*]] = arith.constant 1 + // CHECK-DAG: %[[SCALE_X_N:.*]] = arith.constant 4 + // CHECK-DAG: %[[SCALE_X_D:.*]] = arith.constant 1 + 
// CHECK-DAG: %[[OFFSET_Y:.*]] = arith.constant 0 + // CHECK-DAG: %[[OFFSET_X:.*]] = arith.constant 0 + // CHECK-DAG: %[[BORDER_Y:.*]] = arith.constant 0 + // CHECK-DAG: %[[BORDER_X:.*]] = arith.constant 0 + + // CHECK: %[[VAL_29:.*]] = arith.muli %[[Y]], %[[SCALE_Y_D]] + // CHECK: %[[Y_TEMP:.*]] = arith.addi %[[VAL_29]], %[[OFFSET_Y]] + // CHECK: %[[I_Y:.*]] = arith.floordivsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY:.*]] = arith.remsi %[[Y_TEMP]], %[[SCALE_Y_N]] + // CHECK: %[[RY_FP:.*]] = arith.sitofp %[[RY]] + // CHECK: %[[SCALE_Y_N_FP:.*]] = arith.uitofp %[[SCALE_Y_N]] + // CHECK: %[[D_Y:.*]] = arith.divf %[[RY_FP]], %[[SCALE_Y_N_FP]] + + // CHECK: %[[VAL_30:.*]] = arith.muli %[[X]], %[[SCALE_X_D]] + // CHECK: %[[X_TEMP:.*]] = arith.addi %[[VAL_30]], %[[OFFSET_X]] + // CHECK: %[[I_X:.*]] = arith.floordivsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX:.*]] = arith.remsi %[[X_TEMP]], %[[SCALE_X_N]] + // CHECK: %[[RX_FP:.*]] = arith.sitofp %[[RX]] + // CHECK: %[[SCALE_X_N_FP:.*]] = arith.uitofp %[[SCALE_X_N]] + // CHECK: %[[D_X:.*]] = arith.divf %[[RX_FP]], %[[SCALE_X_N_FP]] // Compute the left, right, and top indices for the bilinear interpolation. diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 1fa783f05f04e..445e8be47678d 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -270,7 +270,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_1:.*]] = arith.cmpi eq, %[[VAL_0]], %[[CONST1]] : index // CHECK: %[[ARG0_DIM0_BROADCAST:.*]] = scf.if %[[VAL_1]] -> (tensor) { - // CHECK: %[[VAL_2:.*]] = tensor.dim %[[ARG0]], %[[CONST1]] : tensor + // CHECK: %[[LOCAL_CONST1:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_2:.*]] = tensor.dim %[[ARG0]], %[[LOCAL_CONST1]] : tensor // CHECK: %[[VAL_3:.*]] = tensor.empty(%[[MAX_DIM0]], %[[VAL_2]]) : tensor // CHECK: %[[VAL_4:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0]] : tensor) outs(%[[VAL_3]] : tensor) { // CHECK: ^bb0(%[[VAL_5:.*]]: f32, %[[VAL_6:.*]]: f32): @@ -284,7 +285,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_8:.*]] = arith.cmpi eq, %[[VAL_7]], %[[CONST1]] : index // CHECK: %[[ARG0_DIM1_BROADCAST:.*]] = scf.if %[[VAL_8]] -> (tensor) { - // CHECK: %[[VAL_9:.*]] = tensor.dim %[[ARG0_DIM0_BROADCAST]], %[[CONST0]] : tensor + // CHECK: %[[LOCAL_CONST0:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_9:.*]] = tensor.dim %[[ARG0_DIM0_BROADCAST]], %[[LOCAL_CONST0]] : tensor // CHECK: %[[VAL_10:.*]] = tensor.empty(%[[VAL_9]], %[[MAX_DIM1]]) : tensor // CHECK: %[[VAL_11:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG0_DIM0_BROADCAST]] : tensor) outs(%[[VAL_10]] : tensor) { // CHECK: ^bb0(%[[VAL_12:.*]]: f32, %[[VAL_13:.*]]: f32): @@ -298,7 +300,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_15:.*]] = arith.cmpi eq, %[[VAL_14]], %[[CONST1]] : index // CHECK: %[[ARG1_DIM0_BROADCAST:.*]] = scf.if %[[VAL_15]] -> (tensor) { - // CHECK: %[[VAL_16:.*]] = tensor.dim %[[ARG1]], %[[CONST1]] : tensor + // CHECK: %[[LOCAL_CONST1:.*]] = arith.constant 1 : index + // CHECK: %[[VAL_16:.*]] = tensor.dim %[[ARG1]], %[[LOCAL_CONST1]] : tensor // CHECK: %[[VAL_17:.*]] = tensor.empty(%[[MAX_DIM0]], %[[VAL_16]]) : tensor // CHECK: %[[VAL_18:.*]] = 
linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG1]] : tensor) outs(%[[VAL_17]] : tensor) { // CHECK: ^bb0(%[[VAL_19:.*]]: f32, %[[VAL_20:.*]]: f32): @@ -312,7 +315,8 @@ func.func @test_add_2d_all_dynamic(%arg0: tensor, %arg1: tensor // CHECK: %[[VAL_22:.*]] = arith.cmpi eq, %[[VAL_21]], %[[CONST1]] : index // CHECK: %[[ARG1_DIM1_BROADCAST:.*]] = scf.if %[[VAL_22]] -> (tensor) { - // CHECK: %[[VAL_23:.*]] = tensor.dim %[[ARG1_DIM0_BROADCAST]], %[[CONST0]] : tensor + // CHECK: %[[LOCAL_CONST0:.*]] = arith.constant 0 : index + // CHECK: %[[VAL_23:.*]] = tensor.dim %[[ARG1_DIM0_BROADCAST]], %[[LOCAL_CONST0]] : tensor // CHECK: %[[VAL_24:.*]] = tensor.empty(%[[VAL_23]], %[[MAX_DIM1]]) : tensor // CHECK: %[[VAL_25:.*]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[ARG1_DIM0_BROADCAST]] : tensor) outs(%[[VAL_24]] : tensor) { // CHECK: ^bb0(%[[VAL_26:.*]]: f32, %[[VAL_27:.*]]: f32): diff --git a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir index 23c6872dcebe9..935c08aceff54 100644 --- a/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir +++ b/mlir/test/Dialect/Affine/value-bounds-op-interface-impl.mlir @@ -131,3 +131,27 @@ func.func @compare_affine_min(%a: index, %b: index) { "test.compare"(%0, %a) {cmp = "LE"} : (index, index) -> () return } + +// ----- + +func.func @compare_const_map() { + %c5 = arith.constant 5 : index + // expected-remark @below{{true}} + "test.compare"(%c5) {cmp = "GT", rhs_map = affine_map<() -> (4)>} + : (index) -> () + // expected-remark @below{{true}} + "test.compare"(%c5) {cmp = "LT", lhs_map = affine_map<() -> (4)>} + : (index) -> () + return +} + +// ----- + +func.func @compare_maps(%a: index, %b: index) { + // expected-remark @below{{true}} + "test.compare"(%a, %b, %b, %a) + {cmp = "GT", lhs_map = affine_map<(d0, d1) -> (1 + d0 + d1)>, + rhs_map = affine_map<(d0, d1) -> (d0 + d1)>} + : (index, index, index, index) -> () + return +} diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir index f8be697548c19..f43ef1cce787c 100644 --- a/mlir/test/Dialect/ArmSME/vector-legalization.mlir +++ b/mlir/test/Dialect/ArmSME/vector-legalization.mlir @@ -433,3 +433,14 @@ func.func @lift_illegal_1d_shape_cast_to_memory(%a: index, %b: index, %memref: m %cast = vector.shape_cast %illegalRead : vector<[4]x1xf32> to vector<[4]xf32> return %cast : vector<[4]xf32> } + +// ----- + +// CHECK-LABEL: @multi_tile_splat +func.func @multi_tile_splat() -> vector<[8]x[8]xi32> +{ + // CHECK: %[[SPLAT:.*]] = arith.constant dense<42> : vector<[4]x[4]xi32> + // CHECK-NEXT: return %[[SPLAT]], %[[SPLAT]], %[[SPLAT]], %[[SPLAT]] : vector<[4]x[4]xi32>, vector<[4]x[4]xi32>, vector<[4]x[4]xi32>, vector<[4]x[4]xi32> + %0 = arith.constant dense<42> : vector<[8]x[8]xi32> + return %0 : vector<[8]x[8]xi32> +} diff --git a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir index 9127eac5da951..5d3c07c8e23c1 100644 --- a/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir +++ b/mlir/test/Dialect/Linalg/vectorization-unsupported.mlir @@ -109,3 +109,20 @@ module attributes {transform.with_named_sequence} { transform.yield } } + + // ----- + +func.func @test_pack_no_vectorize_dynamic_shape(%arg0: tensor, %arg1: tensor<4x16xf32>) -> tensor<4x16xf32> { + %pad = arith.constant 
0.000000e+00 : f32 + // expected-error @+1 {{Attempted to vectorize, but failed}} + %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [0] inner_tiles = [16] into %arg1 : tensor -> tensor<4x16xf32> + return %pack : tensor<4x16xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index fd7d3b4767eb2..80a5a4c6702ac 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -930,3 +930,58 @@ func.func @test_vectorize_unpack_no_masks(%source: tensor<8x8x32x16xf32>, %dest: transform.yield } } + + // ----- + +// CHECK-LABEL: test_vectorize_pack_no_vector_sizes +func.func @test_vectorize_pack_no_vector_sizes(%arg0: tensor<64x4xf32>, %arg1: tensor<2x4x16x2xf32>) -> tensor<2x4x16x2xf32> { + %pack = tensor.pack %arg0 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [16, 2] into %arg1 : tensor<64x4xf32> -> tensor<2x4x16x2xf32> + return %pack : tensor<2x4x16x2xf32> +} +// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]]], %[[cst]] +// CHECK-SAME: {in_bounds = [true, true]} : tensor<64x4xf32>, vector<64x4xf32> +// CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[read]] : vector<64x4xf32> to vector<4x16x2x2xf32> +// CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [2, 0, 1, 3] : vector<4x16x2x2xf32> to vector<2x4x16x2xf32> +// CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<2x4x16x2xf32> +// CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]] +// CHECK-SAME: {in_bounds = [true, true, true, true]} : vector<2x4x16x2xf32>, tensor<2x4x16x2xf32> +// CHECK: return %[[write]] : tensor<2x4x16x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} + + // ----- + +// CHECK-LABEL: test_vectorize_padded_pack_no_vector_sizes +func.func @test_vectorize_padded_pack_no_vector_sizes(%arg0: tensor<32x7x15xf32>, %arg1: tensor<32x4x1x16x2xf32>) -> tensor<32x4x1x16x2xf32> { + %pad = arith.constant 0.000000e+00 : f32 + %pack = tensor.pack %arg0 padding_value(%pad : f32) inner_dims_pos = [2, 1] inner_tiles = [16, 2] into %arg1 : tensor<32x7x15xf32> -> tensor<32x4x1x16x2xf32> + return %pack : tensor<32x4x1x16x2xf32> +} +// CHECK-DAG: %[[cst:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index +// CHECK: %[[transfer_read:.*]] = vector.transfer_read %{{.*}}[%[[c0]], %[[c0]], %[[c0]]], %[[cst]] +// CHECK-SAME: {in_bounds = [true, false, false]} : tensor<32x7x15xf32>, vector<32x8x16xf32> +// CHECK: %[[shape_cast:.*]] = vector.shape_cast %[[transfer_read]] : vector<32x8x16xf32> to vector<32x4x2x1x16xf32> +// CHECK: %[[transpose:.*]] = vector.transpose %[[shape_cast]], [0, 1, 
3, 4, 2] : vector<32x4x2x1x16xf32> to vector<32x4x1x16x2xf32> +// CHECK-DAG: %[[c0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[empty:.*]] = tensor.empty() : tensor<32x4x1x16x2xf32> +// CHECK: %[[write:.*]] = vector.transfer_write %[[transpose]], %[[empty]][%[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]], %[[c0_1]]] +// CHECK-SAME: {in_bounds = [true, true, true, true, true]} : vector<32x4x1x16x2xf32>, tensor<32x4x1x16x2xf32> +// CHECK: return %[[write]] : tensor<32x4x1x16x2xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["tensor.pack"]} in %arg0 : (!transform.any_op) -> !transform.any_op + transform.structured.vectorize %0 : !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 88dca1b85ee5f..9323beadf4549 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -243,145 +243,168 @@ llvm.func @test_omp_wsloop_dynamic_wrong_modifier3(%lb : i64, %ub : i64, %step : // ----- -func.func @omp_simdloop(%lb : index, %ub : index, %step : i32) -> () { - // expected-error @below {{op failed to verify that all of {lowerBound, upperBound, step} have same type}} - "omp.simdloop" (%lb, %ub, %step) ({ - ^bb0(%iv: index): - omp.yield - }) {operandSegmentSizes = array} : - (index, index, i32) -> () +func.func @omp_simd() -> () { + // expected-error @below {{op must be a loop wrapper}} + omp.simd { + omp.terminator + } + return +} + +// ----- +func.func @omp_simd_nested_wrapper() -> () { + // expected-error @below {{op must wrap an 'omp.loop_nest' directly}} + omp.simd { + omp.distribute { + omp.terminator + } + } return } // ----- -func.func @omp_simdloop_pretty_aligned(%lb : index, %ub : index, %step : index, - %data_var : memref) -> () { +func.func @omp_simd_pretty_aligned(%lb : index, %ub : index, %step : index, + %data_var : memref) -> () { // expected-error @below {{expected '->'}} - omp.simdloop aligned(%data_var : memref) - for (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd aligned(%data_var : memref) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_aligned_mismatch(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_mismatch(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{op expected as many alignment values as aligned variables}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () + "omp.simd"(%arg3, %arg4) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } }) {alignment_values = [128], - operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_aligned_negative(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_negative(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{op alignment should be greater than 0}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [-1, 128], operandSegmentSizes = 
array} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg4) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [-1, 128], operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_unexpected_alignment(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_unexpected_alignment(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{unexpected alignment values attribute}} - "omp.simdloop"(%arg0, %arg1, %arg2) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [1, 128], operandSegmentSizes = array} : (index, index, index) -> () + "omp.simd"() ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [1, 128]} : () -> () return } // ----- -func.func @omp_simdloop_aligned_float(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_float(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{failed to satisfy constraint: 64-bit integer array attribute}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [1.5, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg4) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [1.5, 128], operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_aligned_the_same_var(%arg0 : index, %arg1 : index, - %arg2 : index, %arg3 : memref, - %arg4 : memref) -> () { +func.func @omp_simd_aligned_the_same_var(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { // expected-error @below {{aligned variable used more than once}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {alignment_values = [1, 128], operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg3) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {alignment_values = [1, 128], operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_nontemporal_the_same_var(%arg0 : index, - %arg1 : index, - %arg2 : index, - %arg3 : memref) -> () { +func.func @omp_simd_nontemporal_the_same_var(%arg0 : index, %arg1 : index, + %arg2 : index, + %arg3 : memref) -> () { // expected-error @below {{nontemporal variable used more than once}} - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg3) ({ - ^bb0(%arg5: index): - "omp.yield"() : () -> () - }) {operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + "omp.simd"(%arg3, %arg3) ({ + omp.loop_nest (%iv) : index = (%arg0) to (%arg1) step (%arg2) { + omp.yield + } + }) {operandSegmentSizes = array} : (memref, memref) -> () return } // ----- -func.func @omp_simdloop_order_value(%lb : index, %ub : index, %step : index) { +func.func @omp_simd_order_value(%lb : index, %ub : index, %step : index) { // expected-error @below {{invalid clause value: 'default'}} - omp.simdloop order(default) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd order(default) { + omp.loop_nest (%iv) : index = 
(%arg0) to (%arg1) step (%arg2) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { +func.func @omp_simd_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { // expected-error @below {{op attribute 'simdlen' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} - omp.simdloop simdlen(0) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd simdlen(0) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { +func.func @omp_simd_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { // expected-error @below {{op attribute 'safelen' failed to satisfy constraint: 64-bit signless integer attribute whose value is positive}} - omp.simdloop safelen(0) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + omp.simd safelen(0) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // ----- -func.func @omp_simdloop_pretty_simdlen_safelen(%lb : index, %ub : index, %step : index) -> () { - // expected-error @below {{'omp.simdloop' op simdlen clause and safelen clause are both present, but the simdlen value is not less than or equal to safelen value}} - omp.simdloop simdlen(2) safelen(1) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +func.func @omp_simd_pretty_simdlen_safelen(%lb : index, %ub : index, %step : index) -> () { + // expected-error @below {{op simdlen clause and safelen clause are both present, but the simdlen value is not less than or equal to safelen value}} + omp.simd simdlen(2) safelen(1) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } @@ -1580,10 +1603,11 @@ func.func @omp_cancellationpoint2() { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testmemref = "test.memref"() : () -> (memref) // expected-error @below {{expected equal sizes for allocate and allocator variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testmemref) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, memref) -> () + "omp.taskloop"(%testmemref) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array} : (memref) -> () return } @@ -1593,10 +1617,11 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array, reductions = [@add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + "omp.taskloop"(%testf32, %testf32_2) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array, reductions = [@add_f32]} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1604,12 +1629,12 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) - %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol 
references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + "omp.taskloop"(%testf32) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {operandSegmentSizes = array, reductions = [@add_f32, @add_f32]} : (!llvm.ptr) -> () return } @@ -1619,10 +1644,11 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr, !llvm.ptr) -> () + "omp.taskloop"(%testf32, %testf32_2) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {in_reductions = [@add_f32], operandSegmentSizes = array} : (!llvm.ptr, !llvm.ptr) -> () return } @@ -1630,12 +1656,12 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) - %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{expected as many reduction symbol references as reduction variables}} - "omp.taskloop"(%lb, %ub, %ub, %lb, %step, %step, %testf32_2) ({ - ^bb0(%arg3: i32, %arg4: i32): - "omp.terminator"() : () -> () - }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (i32, i32, i32, i32, i32, i32, !llvm.ptr) -> () + "omp.taskloop"(%testf32) ({ + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + }) {in_reductions = [@add_f32, @add_f32], operandSegmentSizes = array} : (!llvm.ptr) -> () return } @@ -1657,9 +1683,10 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{if a reduction clause is present on the taskloop directive, the nogroup clause must not be specified}} - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) nogroup - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - omp.terminator + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) nogroup { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } } return } @@ -1681,9 +1708,10 @@ combiner { func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { %testf32 = "test.f32"() : () -> (!llvm.ptr) // expected-error @below {{the same list item cannot appear in both a reduction and an in_reduction clause}} - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr) in_reduction(@add_f32 -> %testf32 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - omp.terminator + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr) in_reduction(@add_f32 -> %testf32 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } } return } @@ -1693,8 +1721,20 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { func.func @taskloop(%lb: i32, %ub: i32, %step: 
i32) { %testi64 = "test.i64"() : () -> (i64) // expected-error @below {{the grainsize clause and num_tasks clause are mutually exclusive and may not appear on the same taskloop directive}} - omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.taskloop grain_size(%testi64: i64) num_tasks(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + omp.yield + } + } + return +} + +// ----- + +func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { + // expected-error @below {{op must be a loop wrapper}} + omp.taskloop { + %0 = arith.constant 0 : i32 omp.terminator } return @@ -1702,6 +1742,21 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { // ----- +func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { + // expected-error @below {{only supported nested wrapper is 'omp.simd'}} + omp.taskloop { + omp.distribute { + omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) { + omp.yield + } + omp.terminator + } + } + return +} + +// ----- + func.func @omp_threadprivate() { %1 = llvm.mlir.addressof @_QFsubEx : !llvm.ptr // expected-error @below {{op failed to verify that all of {sym_addr, tls_addr} have same type}} @@ -1866,7 +1921,16 @@ func.func @omp_target_depend(%data_var: memref) { // ----- -func.func @omp_distribute(%data_var : memref) -> () { +func.func @omp_distribute_schedule(%chunk_size : i32) -> () { + // expected-error @below {{op chunk size set without dist_schedule_static being present}} + "omp.distribute"(%chunk_size) <{operandSegmentSizes = array}> ({ + "omp.terminator"() : () -> () + }) : (i32) -> () +} + +// ----- + +func.func @omp_distribute_allocate(%data_var : memref) -> () { // expected-error @below {{expected equal sizes for allocate and allocator variables}} "omp.distribute"(%data_var) <{operandSegmentSizes = array}> ({ "omp.terminator"() : () -> () @@ -1875,6 +1939,29 @@ func.func @omp_distribute(%data_var : memref) -> () { // ----- +func.func @omp_distribute_wrapper() -> () { + // expected-error @below {{op must be a loop wrapper}} + "omp.distribute"() ({ + %0 = arith.constant 0 : i32 + "omp.terminator"() : () -> () + }) : () -> () +} + +// ----- + +func.func @omp_distribute_nested_wrapper(%data_var : memref) -> () { + // expected-error @below {{only supported nested wrappers are 'omp.parallel' and 'omp.simd'}} + "omp.distribute"() ({ + "omp.wsloop"() ({ + %0 = arith.constant 0 : i32 + "omp.terminator"() : () -> () + }) : () -> () + "omp.terminator"() : () -> () + }) : () -> () +} + +// ----- + omp.private {type = private} @x.privatizer : i32 alloc { ^bb0(%arg0: i32): %0 = arith.constant 0.0 : f32 diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 851d44ad984ee..e2ca12afc14bd 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -171,6 +171,23 @@ func.func @omp_loop_nest(%lb : index, %ub : index, %step : index) -> () { omp.yield } + // TODO Remove induction variables from omp.wsloop. 
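+  // The wsloop tests below keep the transitional form in which omp.wsloop
+  // still binds its own induction variable while wrapping an explicit
+  // omp.loop_nest. Once the TODO above is addressed, they are expected to
+  // collapse to the wrapper-only form already used by omp.simd, roughly:
+  //   omp.wsloop {
+  //     omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
+  //       omp.yield
+  //     }
+  //   }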
+ omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) { + // CHECK: omp.loop_nest + // CHECK-SAME: (%{{.*}}) : index = + // CHECK-SAME: (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + "omp.loop_nest" (%lb, %ub, %step) ({ + ^bb0(%iv2: index): + // CHECK: test.op1 + "test.op1"(%lb) : (index) -> () + // CHECK: test.op2 + "test.op2"() : () -> () + // CHECK: omp.yield + omp.yield + }) : (index, index, index) -> () + omp.yield + } + return } @@ -209,6 +226,22 @@ func.func @omp_loop_nest_pretty(%lb : index, %ub : index, %step : index) -> () { omp.yield } + // TODO Remove induction variables from omp.wsloop. + omp.wsloop for (%iv) : index = (%lb) to (%ub) step (%step) { + // CHECK: omp.loop_nest + // CHECK-SAME: (%{{.*}}) : index = + // CHECK-SAME: (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.loop_nest (%iv2) : index = (%lb) to (%ub) step (%step) { + // CHECK: test.op1 + "test.op1"(%lb) : (index) -> () + // CHECK: test.op2 + "test.op2"() : () -> () + // CHECK: omp.yield + omp.yield + } + omp.yield + } + return } @@ -406,185 +439,214 @@ func.func @omp_wsloop_pretty_multiple(%lb1 : i32, %ub1 : i32, %step1 : i32, %lb2 return } -// CHECK-LABEL: omp_simdloop -func.func @omp_simdloop(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - "omp.simdloop" (%lb, %ub, %step) ({ - ^bb0(%iv: index): - omp.yield - }) {operandSegmentSizes = array} : - (index, index, index) -> () +// CHECK-LABEL: omp_simd +func.func @omp_simd(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd + "omp.simd" () ({ + "omp.loop_nest" (%lb, %ub, %step) ({ + ^bb1(%iv2: index): + "omp.yield"() : () -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () + }) : () -> () return } -// CHECK-LABEL: omp_simdloop_aligned_list -func.func @omp_simdloop_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : memref, %arg4 : memref) -> () { - // CHECK: omp.simdloop aligned(%{{.*}} : memref -> 32 : i64, +// CHECK-LABEL: omp_simd_aligned_list +func.func @omp_simd_aligned_list(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : memref, %arg4 : memref) -> () { + // CHECK: omp.simd aligned( + // CHECK-SAME: %{{.*}} : memref -> 32 : i64, // CHECK-SAME: %{{.*}} : memref -> 128 : i64) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): + "omp.simd"(%arg3, %arg4) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () }) {alignment_values = [32, 128], - operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + operandSegmentSizes = array} : (memref, memref) -> () return } -// CHECK-LABEL: omp_simdloop_aligned_single -func.func @omp_simdloop_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : memref, %arg4 : memref) -> () { - // CHECK: omp.simdloop aligned(%{{.*}} : memref -> 32 : i64) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3) ({ - ^bb0(%arg5: index): +// CHECK-LABEL: omp_simd_aligned_single +func.func @omp_simd_aligned_single(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : memref, %arg4 : memref) -> () { + // CHECK: omp.simd aligned(%{{.*}} : memref -> 32 : i64) + "omp.simd"(%arg3) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () + }) : (index, index, 
index) -> () + "omp.terminator"() : () -> () }) {alignment_values = [32], - operandSegmentSizes = array} : (index, index, index, memref) -> () + operandSegmentSizes = array} : (memref) -> () return } -// CHECK-LABEL: omp_simdloop_nontemporal_list -func.func @omp_simdloop_nontemporal_list(%arg0 : index, - %arg1 : index, - %arg2 : index, - %arg3 : memref, - %arg4 : memref) -> () { - // CHECK: omp.simdloop nontemporal(%{{.*}}, %{{.*}} : memref, memref) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3, %arg4) ({ - ^bb0(%arg5: index): +// CHECK-LABEL: omp_simd_nontemporal_list +func.func @omp_simd_nontemporal_list(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { + // CHECK: omp.simd nontemporal(%{{.*}}, %{{.*}} : memref, memref) + "omp.simd"(%arg3, %arg4) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () - }) {operandSegmentSizes = array} : (index, index, index, memref, memref) -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () + }) {operandSegmentSizes = array} : (memref, memref) -> () return } -// CHECK-LABEL: omp_simdloop_nontemporal_single -func.func @omp_simdloop_nontemporal_single(%arg0 : index, - %arg1 : index, - %arg2 : index, - %arg3 : memref, - %arg4 : memref) -> () { - // CHECK: omp.simdloop nontemporal(%{{.*}} : memref) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - "omp.simdloop"(%arg0, %arg1, %arg2, %arg3) ({ - ^bb0(%arg5: index): +// CHECK-LABEL: omp_simd_nontemporal_single +func.func @omp_simd_nontemporal_single(%arg0 : index, %arg1 : index, + %arg2 : index, %arg3 : memref, + %arg4 : memref) -> () { + // CHECK: omp.simd nontemporal(%{{.*}} : memref) + "omp.simd"(%arg3) ({ + "omp.loop_nest" (%arg0, %arg1, %arg2) ({ + ^bb1(%iv2: index): "omp.yield"() : () -> () - }) {operandSegmentSizes = array} : (index, index, index, memref) -> () + }) : (index, index, index) -> () + "omp.terminator"() : () -> () + }) {operandSegmentSizes = array} : (memref) -> () return } -// CHECK-LABEL: omp_simdloop_pretty -func.func @omp_simdloop_pretty(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop for (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty +func.func @omp_simd_pretty(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd { + omp.simd { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: func.func @omp_simdloop_pretty_aligned( -func.func @omp_simdloop_pretty_aligned(%lb : index, %ub : index, %step : index, - %data_var : memref, - %data_var1 : memref) -> () { - // CHECK: omp.simdloop aligned(%{{.*}} : memref -> 32 : i64, +// CHECK-LABEL: func.func @omp_simd_pretty_aligned( +func.func @omp_simd_pretty_aligned(%lb : index, %ub : index, %step : index, + %data_var : memref, + %data_var1 : memref) -> () { + // CHECK: omp.simd aligned( + // CHECK-SAME: %{{.*}} : memref -> 32 : i64, // CHECK-SAME: %{{.*}} : memref -> 128 : i64) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - omp.simdloop aligned(%data_var : memref -> 32, %data_var1 : memref -> 128) - for (%iv) : index = (%lb) to (%ub) step (%step) { + omp.simd aligned(%data_var : memref -> 32, %data_var1 : memref -> 128) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { omp.yield + } 
} return } -// CHECK-LABEL: omp_simdloop_pretty_if -func.func @omp_simdloop_pretty_if(%lb : index, %ub : index, %step : index, %if_cond : i1) -> () { - // CHECK: omp.simdloop if(%{{.*}}) for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop if(%if_cond) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_if +func.func @omp_simd_pretty_if(%lb : index, %ub : index, %step : index, %if_cond : i1) -> () { + // CHECK: omp.simd if(%{{.*}}) + omp.simd if(%if_cond) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: func.func @omp_simdloop_pretty_nontemporal -func.func @omp_simdloop_pretty_nontemporal(%lb : index, - %ub : index, - %step : index, - %data_var : memref, - %data_var1 : memref) -> () { - // CHECK: omp.simdloop nontemporal(%{{.*}}, %{{.*}} : memref, memref) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { - omp.simdloop nontemporal(%data_var, %data_var1 : memref, memref) - for (%iv) : index = (%lb) to (%ub) step (%step) { +// CHECK-LABEL: func.func @omp_simd_pretty_nontemporal +func.func @omp_simd_pretty_nontemporal(%lb : index, %ub : index, %step : index, + %data_var : memref, + %data_var1 : memref) -> () { + // CHECK: omp.simd nontemporal(%{{.*}}, %{{.*}} : memref, memref) + omp.simd nontemporal(%data_var, %data_var1 : memref, memref) { + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { omp.yield - } - return -} -// CHECK-LABEL: omp_simdloop_pretty_order -func.func @omp_simdloop_pretty_order(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop order(concurrent) - // CHECK-SAME: for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop order(concurrent) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_simdlen -func.func @omp_simdloop_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop simdlen(2) for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop simdlen(2) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_order +func.func @omp_simd_pretty_order(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd order(concurrent) + omp.simd order(concurrent) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_safelen -func.func @omp_simdloop_pretty_safelen(%lb : index, %ub : index, %step : index) -> () { - // CHECK: omp.simdloop safelen(2) for (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) - omp.simdloop safelen(2) for (%iv): index = (%lb) to (%ub) step (%step) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_simdlen +func.func @omp_simd_pretty_simdlen(%lb : index, %ub : index, %step : index) -> () { + // CHECK: omp.simd simdlen(2) + omp.simd simdlen(2) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } -// CHECK-LABEL: omp_simdloop_pretty_multiple -func.func @omp_simdloop_pretty_multiple(%lb1 : index, %ub1 : index, %step1 : index, %lb2 : index, %ub2 : index, %step2 : index) -> () { - // CHECK: omp.simdloop for (%{{.*}}, %{{.*}}) : index = (%{{.*}}, %{{.*}}) to (%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) - omp.simdloop for (%iv1, %iv2) : index = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - omp.yield +// CHECK-LABEL: omp_simd_pretty_safelen +func.func @omp_simd_pretty_safelen(%lb : index, 
%ub : index, %step : index) -> () { + // CHECK: omp.simd safelen(2) + omp.simd safelen(2) { + omp.loop_nest (%iv): index = (%lb) to (%ub) step (%step) { + omp.yield + } } return } // CHECK-LABEL: omp_distribute -func.func @omp_distribute(%chunk_size : i32, %data_var : memref) -> () { +func.func @omp_distribute(%chunk_size : i32, %data_var : memref, %arg0 : i32) -> () { // CHECK: omp.distribute "omp.distribute" () ({ - omp.terminator + "omp.loop_nest" (%arg0, %arg0, %arg0) ({ + ^bb0(%iv: i32): + "omp.yield"() : () -> () + }) : (i32, i32, i32) -> () + "omp.terminator"() : () -> () }) {} : () -> () // CHECK: omp.distribute omp.distribute { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute dist_schedule_static omp.distribute dist_schedule_static { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32) omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute order(concurrent) omp.distribute order(concurrent) { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } } // CHECK: omp.distribute allocate(%{{.+}} : memref -> %{{.+}} : memref) omp.distribute allocate(%data_var : memref -> %data_var : memref) { - omp.terminator + omp.loop_nest (%iv) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } + } + // CHECK: omp.distribute + omp.distribute { + omp.simd { + omp.loop_nest (%iv2) : i32 = (%arg0) to (%arg0) step (%arg0) { + omp.yield + } + } } -return + return } @@ -2000,135 +2062,125 @@ func.func @omp_taskgroup_clauses() -> () { // CHECK-LABEL: @omp_taskloop func.func @omp_taskloop(%lb: i32, %ub: i32, %step: i32) -> () { - // CHECK: omp.taskloop for (%{{.+}}) : i32 = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) { - omp.taskloop for (%i) : i32 = (%lb) to (%ub) step (%step) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.taskloop for (%{{.+}}) : i32 = (%{{.+}}) to (%{{.+}}) step (%{{.+}}) { - omp.taskloop for (%i) : i32 = (%lb) to (%ub) step (%step) { - // CHECK: test.op1 - "test.op1"(%lb) : (i32) -> () - // CHECK: test.op2 - "test.op2"() : () -> () - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.taskloop for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator - } - - // CHECK: omp.taskloop for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) inclusive step (%{{.+}}, %{{.+}}) { - omp.taskloop for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) inclusive step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop { + omp.taskloop { + omp.loop_nest (%i) : i32 = (%lb) to (%ub) step (%step) { + // CHECK: omp.yield + omp.yield + } } %testbool = "test.bool"() : () -> (i1) - // CHECK: omp.taskloop if(%{{[^)]+}}) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop if(%testbool) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop if(%{{[^)]+}}) { + omp.taskloop if(%testbool) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step 
(%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop final(%{{[^)]+}}) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop final(%testbool) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop final(%{{[^)]+}}) { + omp.taskloop final(%testbool) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop untied - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop untied - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop untied { + omp.taskloop untied { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop mergeable - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop mergeable - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop mergeable { + omp.taskloop mergeable { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testf32 = "test.f32"() : () -> (!llvm.ptr) %testf32_2 = "test.f32"() : () -> (!llvm.ptr) - // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) { + omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop reduction(@add_f32 -> %{{.+}} : !llvm.ptr, @add_f32 -> %{{.+}} : !llvm.ptr) { + omp.taskloop reduction(@add_f32 -> %testf32 : !llvm.ptr, @add_f32 -> %testf32_2 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr) reduction(@add_f32 -> %{{.+}} : !llvm.ptr) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr) reduction(@add_f32 -> %testf32_2 : !llvm.ptr) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: 
omp.taskloop in_reduction(@add_f32 -> %{{.+}} : !llvm.ptr) reduction(@add_f32 -> %{{.+}} : !llvm.ptr) { + omp.taskloop in_reduction(@add_f32 -> %testf32 : !llvm.ptr) reduction(@add_f32 -> %testf32_2 : !llvm.ptr) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testi32 = "test.i32"() : () -> (i32) - // CHECK: omp.taskloop priority(%{{[^:]+}}: i32) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop priority(%testi32: i32) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop priority(%{{[^:]+}}: i32) { + omp.taskloop priority(%testi32: i32) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testmemref = "test.memref"() : () -> (memref) - // CHECK: omp.taskloop allocate(%{{.+}} : memref -> %{{.+}} : memref) - omp.taskloop allocate(%testmemref : memref -> %testmemref : memref) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop allocate(%{{.+}} : memref -> %{{.+}} : memref) { + omp.taskloop allocate(%testmemref : memref -> %testmemref : memref) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } %testi64 = "test.i64"() : () -> (i64) - // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop grain_size(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop grain_size(%{{[^:]+}}: i64) { + omp.taskloop grain_size(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop num_tasks(%{{[^:]+}}: i64) - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop num_tasks(%testi64: i64) - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop num_tasks(%{{[^:]+}}: i64) { + omp.taskloop num_tasks(%testi64: i64) { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } } - // CHECK: omp.taskloop nogroup - // CHECK-SAME: for (%{{.+}}, %{{.+}}) : i32 = (%{{.+}}, %{{.+}}) to (%{{.+}}, %{{.+}}) step (%{{.+}}, %{{.+}}) { - omp.taskloop nogroup - for (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { - // CHECK: omp.terminator - omp.terminator + // CHECK: omp.taskloop nogroup { + omp.taskloop nogroup { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } + } + + // CHECK: omp.taskloop { + omp.taskloop { + omp.simd { + omp.loop_nest (%i, %j) : i32 = (%lb, %ub) to (%ub, %lb) step (%step, %step) { + // CHECK: omp.yield + omp.yield + } + } } // CHECK: return diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir index 7f5c05190fc9a..3fa696e1600a9 100644 --- 
a/mlir/test/Dialect/SparseTensor/invalid.mlir +++ b/mlir/test/Dialect/SparseTensor/invalid.mlir @@ -1012,3 +1012,85 @@ func.func @sparse_print(%arg0: tensor<10x10xf64>) { sparse_tensor.print %arg0 : tensor<10x10xf64> return } + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 2>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' expect larger level upper bound than lower bound}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 to 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 2> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 0 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be specified iff level lower bound equals 0}} + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 1 : tensor<4x8xf32, #COO> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +#CSR = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : dense, + j : compressed + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#CSR, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op mismatch in parent iterator encoding and iteration space encoding.}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#CSR, lvls = 0> + return +} + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) { + // expected-error@+1 {{'sparse_tensor.extract_iteration_space' op parent iterator should be used to extract an iteration space from a consecutive level.}} + %l1 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 2 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return +} diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir index 12f69c1d37b9c..d34071279e512 100644 --- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir +++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir @@ -738,3 +738,28 @@ func.func @sparse_has_runtime() -> i1 { %has_runtime = sparse_tensor.has_runtime_library return %has_runtime : i1 } + +// ----- + +#COO = #sparse_tensor.encoding<{ + map = (i, j) -> ( + i : compressed(nonunique), + j : singleton(soa) + ) +}> + +// CHECK-LABEL: func.func @sparse_extract_iter_space( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32, #sparse{{[0-9]*}}>, +// CHECK-SAME: %[[VAL_1:.*]]: 
!sparse_tensor.iterator<#sparse{{[0-9]*}}, lvls = 0>) +// CHECK: %[[VAL_2:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] lvls = 0 +// CHECK: %[[VAL_3:.*]] = sparse_tensor.extract_iteration_space %[[VAL_0]] at %[[VAL_1]] lvls = 1 +// CHECK: return %[[VAL_2]], %[[VAL_3]] : !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0>, !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 1> +// CHECK: } +func.func @sparse_extract_iter_space(%sp : tensor<4x8xf32, #COO>, %it1 : !sparse_tensor.iterator<#COO, lvls = 0>) + -> (!sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1>) { + // Extracting the iteration space for the first level needs no parent iterator. + %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<4x8xf32, #COO> + // Extracting the iteration space for the second level needs a parent iterator. + %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1 : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0> + return %l1, %l2 : !sparse_tensor.iter_space<#COO, lvls = 0>, !sparse_tensor.iter_space<#COO, lvls = 1> +} diff --git a/mlir/test/Dialect/Transform/test-pattern-application.mlir b/mlir/test/Dialect/Transform/test-pattern-application.mlir index fa8a555af9218..f78b4b6f6798c 100644 --- a/mlir/test/Dialect/Transform/test-pattern-application.mlir +++ b/mlir/test/Dialect/Transform/test-pattern-application.mlir @@ -26,6 +26,36 @@ module attributes {transform.with_named_sequence} { // ----- +// CHECK-LABEL: @limited_updates +func.func @limited_updates() { + "test.container"() ({ + // Only one is replaced. + // CHECK: "test.foo"() {replace_with_new_op = "test.foo"} + // CHECK: "test.foo"() : () + %0 = "test.foo"() {replace_with_new_op = "test.foo"} : () -> (i32) + %1 = "test.foo"() {replace_with_new_op = "test.foo"} : () -> (i32) + }) : () -> () + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + // Pattern application will fail because of the upper limit, wrap in + // sequence to suppress the error message. 
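+    // max_num_rewrites caps how many rewrites may be applied: with a cap of
+    // 1, only the first "test.foo" is replaced (see the CHECK lines above),
+    // and exceeding the cap is what makes the pattern application fail.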
+ transform.sequence %arg0 : !transform.any_op failures(suppress) { + ^bb0(%arg1: !transform.any_op): + %0 = transform.structured.match ops{["test.container"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1 = transform.structured.match ops{["test.foo"]} in %arg1 : (!transform.any_op) -> !transform.any_op + transform.apply_patterns to %0 { + transform.apply_patterns.transform.test_patterns + } {max_num_rewrites = 1} : !transform.any_op + } + transform.yield + } +} + +// ----- + func.func @replacement_op_not_found() { "test.container"() ({ // expected-note @below {{[0] replaced op}} diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index 627ac54cf145b..61a5f2a96e1c1 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -1943,14 +1943,6 @@ func.func @shuffle_nofold1(%v0 : vector<4xi32>, %v1 : vector<2xi32>) -> vector<5 return %shuffle : vector<5xi32> } -// CHECK-LABEL: func @shuffle_nofold2 -// CHECK: %[[V:.+]] = vector.shuffle %arg0, %arg1 [0, 1, 2, 3] : vector<[4]xi32>, vector<[2]xi32> -// CHECK: return %[[V]] -func.func @shuffle_nofold2(%v0 : vector<[4]xi32>, %v1 : vector<[2]xi32>) -> vector<4xi32> { - %shuffle = vector.shuffle %v0, %v1 [0, 1, 2, 3] : vector<[4]xi32>, vector<[2]xi32> - return %shuffle : vector<4xi32> -} - // ----- // CHECK-LABEL: func @transpose_scalar_broadcast1 diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index c16f1cb2876db..c9f7e9c6e2fb0 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -84,6 +84,13 @@ func.func @shuffle_index_out_of_range(%arg0: vector<2xf32>, %arg1: vector<2xf32> // ----- +func.func @shuffle_scalable_vec(%arg0: vector<[2]xf32>, %arg1: vector<[2]xf32>) { + // expected-error@+1 {{'vector.shuffle' op operand #0 must be fixed-length vector of any type values}} + %1 = vector.shuffle %arg0, %arg1 [0, 1, 2, 3] : vector<[2]xf32>, vector<[2]xf32> +} + +// ----- + func.func @shuffle_empty_mask(%arg0: vector<2xf32>, %arg1: vector<2xf32>) { // expected-error@+1 {{'vector.shuffle' op invalid mask length}} %1 = vector.shuffle %arg0, %arg1 [] : vector<2xf32>, vector<2xf32> diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir index 039346adbb851..f0945c79a94ac 100644 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -59,4 +59,66 @@ gpu.func @test_store_nd_vc(%dst: memref<24x32xf16>) { gpu.return } +// CHECK: gpu.func @test_create_update_nd_tdesc_vc(%[[arg0:.*]]: memref<24x32xf32>) { +gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf32> + // CHECK: %[[R1:.*]] = xegpu.update_nd_offset %[[REG]], [0, 16] : !xegpu.tensor_desc<8x16xf32> + %2 = xegpu.update_nd_offset %1, [0, 16]: !xegpu.tensor_desc<8x16xf32> + gpu.return +} + +// CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) { +gpu.func @test_create_tdesc_vc(%src: ui64) { + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + gpu.return +} + +// CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) { 
+gpu.func @test_prefetch_vc(%src: ui64) { + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + gpu.return +} + +// CHECK: gpu.func @test_load_gather_vc(%[[arg0:.*]]: ui64) { +gpu.func @test_load_gather_vc(%src: ui64) { + //CHECK: %[[cst:.*]] = arith.constant dense : vector<4xi1> + %0 = arith.constant dense<1>: vector<4xi1> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> + gpu.return +} + +// CHECK: gpu.func @test_store_scatter_vc(%[[arg0:.*]]: ui64) { +gpu.func @test_store_scatter_vc(%src: ui64) { + //CHECK: %[[c0:.*]] = arith.constant dense : vector<4xi1> + %0 = arith.constant dense<1>: vector<4xi1> + //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32> + %1 = arith.constant dense<2.9>: vector<4x2xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + //CHECK-SAME: vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> + : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + gpu.return +} + +// CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) { +gpu.func @test_create_update_tdesc_vc(%src: ui64) { + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + gpu.return +} + } \ No newline at end of file diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir new file mode 100644 index 0000000000000..5e29361ec6908 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -0,0 +1,159 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// ----- +func.func @test_create_nd_tdesc_vc_1(%src: memref<24xf32>) { + // expected-error@+1 {{Expecting the rank of shape, strides, offsets, source 
memref type (if source is a memref) and TensorDesc should match with each other. They currenlty are 2D.}} + %1 = xegpu.create_nd_tdesc %src[0] : memref<24xf32> -> !xegpu.tensor_desc<8x16xf32> + return +} + +// ----- + +func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { + // expected-error@+1 {{TensorDesc should have the same element type with the source if it is a memref}} + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<8x16xf16> + return +} + +// ----- +func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> + // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<8x16xf16> + return +} + +// ----- +func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { + %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7] + : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr> + // expected-error@+1 {{Expects a non-scattered TensorDesc}} + xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr> + return +} + +// ----- +func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<8x16xf16> -> vector<4x16x2xf16> + return +} + +// ----- +func.func @test_load_nd_vc_2(%src: memref<16xf16>) { + %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + // expected-error@+1 {{Expects a non-scattered TensorDesc.}} + %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> -> vector<8x2xf16> + return +} + +// ----- +func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { + %1 = arith.constant dense<1.0>: vector<24x32xf16> + %2 = xegpu.create_nd_tdesc %dst[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}>: vector<24x32xf16>, !xegpu.tensor_desc<24x32xf16> + return +} + +// ----- +func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { + %1 = arith.constant dense<1.0>: vector<8x2xf16> + %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + // expected-error@+1 {{Expects a non-scattered TensorDesc}} + xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}> + : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + return +} + +// ----- +func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { + %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + // expected-error@+1 {{Expects a non-scattered TensorDesc}} + xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + return +} + +// ----- +func.func @test_create_tdesc_vc_1(%src: ui64) { + // expected-error@+1 {{Expects a scattered TensorDesc}} + %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} + : ui64 -> !xegpu.tensor_desc<8x2xf16> + return +} + +// ----- +func.func @test_create_tdesc_vc_2(%src: ui64) { + // expected-error@+1 {{Incorrect TensorDesc shape}} + %1 = xegpu.create_tdesc 
%src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} + : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.tdesc_attr> + return +} + +// ----- +func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16> + // expected-error@+1 {{Expects a scattered TensorDesc}} + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<24x32xf16> + return +} + +// ----- +func.func @test_prefetch_vc_2(%src: ui64) { + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + return +} + +// ----- +func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) { + %0 = arith.constant dense<1>: vector<4xi1> + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<4x2xf16> + // expected-error@+1 {{Expects a scattered TensorDesc}} + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<4x2xf16>, vector<4xi1> -> vector<4x2xf16> + return +} + +// ----- +func.func @test_load_gather_vc_2(%src: ui64) { + %0 = arith.constant dense<1>: vector<4xi1> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 + -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> + : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + -> vector<4x2xf32> + return +} + +// ----- +func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { + %0 = arith.constant dense<1>: vector<4xi1> + %1 = arith.constant dense<2.9>: vector<4x2xf32> + %2 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<4x2xf32> + // expected-error@+1 {{Expects a scattered TensorDesc}} + xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> + : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32>, vector<4xi1> + return +} + +// ----- +func.func @test_store_scatter_vc_2(%src: ui64) { + %0 = arith.constant dense<1>: vector<4xi1> + %1 = arith.constant dense<2.9>: vector<4x2xf32> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} + : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} + xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> : vector<4x2xf32>, + !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + return +} \ No newline at end of file diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir index 2ff73923c8327..467b671500e17 100755 --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir @@ -30,6 +30,10 @@ // Do the same run, but now with direct IR generation and VLA vectorization. // RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %} +// Test that test-bufferization-analysis-only works. This option is useful +// for understanding why buffer copies were inserted. 
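+// Analysis-only mode runs the bufferization analysis without rewriting the
+// IR, so the output below is simply discarded via -o /dev/null.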
+// RUN: mlir-opt %s --sparsifier="test-bufferization-analysis-only" -o /dev/null + #Sparse1 = #sparse_tensor.encoding<{ map = (i, j, k) -> ( j : compressed, diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir new file mode 100644 index 0000000000000..05a78e32b9e11 --- /dev/null +++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir @@ -0,0 +1,112 @@ +// DEFINE: %{tosa-to-linalg-pipeline} = -pass-pipeline="builtin.module(func.func(tosa-infer-shapes,tosa-to-linalg-named,tosa-to-linalg,tosa-to-arith))" + +// RUN: mlir-opt %s \ +// RUN: %{tosa-to-linalg-pipeline} \ +// RUN: | mlir-opt \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -buffer-deallocation-pipeline \ +// RUN: -test-lower-to-llvm \ +// RUN: | mlir-cpu-runner \ +// RUN: -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \ +// RUN: | FileCheck %s + +// Validate that the TOSA lowering for tosa.max_pool2d produces the same results +// for fully static and fully dynamic inputs. + +!tensor_type = tensor<1x4x4x1xf32> +!memref_type = memref<1x4x4x1xf32> + +// Utility functions +func.func private @printMemrefF32(memref<*xf32>) attributes { llvm.emit_c_interface } + +func.func @max_pool_static(%arg0: !tensor_type) -> (!tensor_type) { + %0 = tosa.max_pool2d %arg0 { + pad = array, + kernel = array, + stride = array + } : (tensor<1x4x4x1xf32>) -> tensor<1x4x4x1xf32> + return %0 : tensor<1x4x4x1xf32> +} + +func.func @max_pool_dynamic(%arg0: tensor) -> (tensor) { + %0 = tosa.max_pool2d %arg0 { + pad = array, + kernel = array, + stride = array + } : (tensor) -> tensor + return %0 : tensor +} + +// Test harness to compare the results of a fully statically shaped max_pool2d with +// a fully dynamically shaped max_pool2d on the same inputs. 
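+// The same constant input is fed to both entry points (the dynamic variant
+// receives it through a tensor.cast), so the two printed buffers below are
+// expected to match value for value.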
+func.func @main() { + %A = arith.constant dense<[[ + [[0.0], [0.1], [0.2], [0.3]], // H = 0 + [[1.0], [1.1], [1.2], [1.3]], // H = 1 + [[2.0], [2.1], [2.2], [2.3]], // H = 2 + [[3.0], [3.1], [3.2], [3.3]] // H = 3 + ]]> : tensor<1x4x4x1xf32> + + %A_dynamic = tensor.cast %A : !tensor_type to tensor + + // Call both static and dynamically sized variants + %result_static = func.call @max_pool_static(%A) : (!tensor_type) -> !tensor_type + %result_dynamic = func.call @max_pool_dynamic(%A_dynamic) : (tensor) -> tensor + + %static_buffer = bufferization.to_memref %result_static : !memref_type + %unranked_static_buffer = memref.cast %static_buffer : !memref_type to memref<*xf32> + + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data = + + // CHECK-NEXT: 1.1 + // CHECK-NEXT: 1.2 + // CHECK-NEXT: 1.3 + // CHECK-NEXT: 1.3 + + // CHECK-NEXT: 2.1 + // CHECK-NEXT: 2.2 + // CHECK-NEXT: 2.3 + // CHECK-NEXT: 2.3 + + // CHECK-NEXT: 3.1 + // CHECK-NEXT: 3.2 + // CHECK-NEXT: 3.3 + // CHECK-NEXT: 3.3 + + // CHECK-NEXT: 3.1 + // CHECK-NEXT: 3.2 + // CHECK-NEXT: 3.3 + // CHECK-NEXT: 3.3 + + func.call @printMemrefF32(%unranked_static_buffer) : (memref<*xf32>) -> () + + %dynamic_buffer = bufferization.to_memref %result_dynamic : memref + %unranked_dynamic_buffer = memref.cast %dynamic_buffer : memref to memref<*xf32> + + // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data = + // CHECK-NEXT: 1.1 + // CHECK-NEXT: 1.2 + // CHECK-NEXT: 1.3 + // CHECK-NEXT: 1.3 + + // CHECK-NEXT: 2.1 + // CHECK-NEXT: 2.2 + // CHECK-NEXT: 2.3 + // CHECK-NEXT: 2.3 + + // CHECK-NEXT: 3.1 + // CHECK-NEXT: 3.2 + // CHECK-NEXT: 3.3 + // CHECK-NEXT: 3.3 + + // CHECK-NEXT: 3.1 + // CHECK-NEXT: 3.2 + // CHECK-NEXT: 3.3 + // CHECK-NEXT: 3.3 + + func.call @printMemrefF32(%unranked_dynamic_buffer) : (memref<*xf32>) -> () + + return +} + diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 81a6eadbadd3f..bf6847a32ff4f 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -597,7 +597,7 @@ define void @ushl_sat_test(i32 %0, i32 %1, <8 x i32> %2, <8 x i32> %3) { } ; CHECK-LABEL: llvm.func @va_intrinsics_test -define void @va_intrinsics_test(ptr %0, ptr %1) { +define void @va_intrinsics_test(ptr %0, ptr %1, ...) { ; CHECK: llvm.intr.vastart %{{.*}} call void @llvm.va_start.p0(ptr %0) ; CHECK: llvm.intr.vacopy %{{.*}} to %{{.*}} diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir index 4cb99c1f1a285..d1390022c1dc4 100644 --- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir +++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir @@ -638,10 +638,10 @@ llvm.func @test_omp_wsloop_guided_simd(%lb : i64, %ub : i64, %step : i64) -> () // ----- -// CHECK-LABEL: @simdloop_simple -llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) { - "omp.simdloop" (%lb, %ub, %step) ({ - ^bb0(%iv: i64): +// CHECK-LABEL: @simd_simple +llvm.func @simd_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) { + "omp.simd" () ({ + omp.loop_nest (%iv) : i64 = (%lb) to (%ub) step (%step) { %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 // The form of the emitted IR is controlled by OpenMPIRBuilder and // tested there. Just check that the right metadata is added. 
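  // (The access-group metadata checked in these tests is what marks the simd
  // loop body accesses as independent, allowing LLVM to vectorize the loop.)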
@@ -649,8 +649,9 @@ llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) %4 = llvm.getelementptr %arg0[%iv] : (!llvm.ptr, i64) -> !llvm.ptr, f32 llvm.store %3, %4 : f32, !llvm.ptr omp.yield - }) {operandSegmentSizes = array} : - (i64, i64, i64) -> () + } + "omp.terminator"() : () -> () + }) : () -> () llvm.return } @@ -659,34 +660,36 @@ llvm.func @simdloop_simple(%lb : i64, %ub : i64, %step : i64, %arg0: !llvm.ptr) // ----- -// CHECK-LABEL: @simdloop_simple_multiple -llvm.func @simdloop_simple_multiple(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { - omp.simdloop for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { - %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 - // The form of the emitted IR is controlled by OpenMPIRBuilder and - // tested there. Just check that the right metadata is added and collapsed - // loop bound is generated (Collapse clause is represented as a loop with - // list of indices, bounds and steps where the size of the list is equal - // to the collapse value.) - // CHECK: icmp slt i64 - // CHECK-COUNT-3: select - // CHECK: %[[TRIPCOUNT0:.*]] = select - // CHECK: br label %[[PREHEADER:.*]] - // CHECK: [[PREHEADER]]: - // CHECK: icmp slt i64 - // CHECK-COUNT-3: select - // CHECK: %[[TRIPCOUNT1:.*]] = select - // CHECK: mul nuw i64 %[[TRIPCOUNT0]], %[[TRIPCOUNT1]] - // CHECK: br label %[[COLLAPSED_PREHEADER:.*]] - // CHECK: [[COLLAPSED_PREHEADER]]: - // CHECK: br label %[[COLLAPSED_HEADER:.*]] - // CHECK: llvm.access.group - // CHECK-NEXT: llvm.access.group - %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32 - llvm.store %3, %4 : f32, !llvm.ptr - llvm.store %3, %5 : f32, !llvm.ptr - omp.yield +// CHECK-LABEL: @simd_simple_multiple +llvm.func @simd_simple_multiple(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) { + omp.simd { + omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { + %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32 + // The form of the emitted IR is controlled by OpenMPIRBuilder and + // tested there. Just check that the right metadata is added and collapsed + // loop bound is generated (Collapse clause is represented as a loop with + // list of indices, bounds and steps where the size of the list is equal + // to the collapse value.) 
+      // CHECK: icmp slt i64
+      // CHECK-COUNT-3: select
+      // CHECK: %[[TRIPCOUNT0:.*]] = select
+      // CHECK: br label %[[PREHEADER:.*]]
+      // CHECK: [[PREHEADER]]:
+      // CHECK: icmp slt i64
+      // CHECK-COUNT-3: select
+      // CHECK: %[[TRIPCOUNT1:.*]] = select
+      // CHECK: mul nuw i64 %[[TRIPCOUNT0]], %[[TRIPCOUNT1]]
+      // CHECK: br label %[[COLLAPSED_PREHEADER:.*]]
+      // CHECK: [[COLLAPSED_PREHEADER]]:
+      // CHECK: br label %[[COLLAPSED_HEADER:.*]]
+      // CHECK: llvm.access.group
+      // CHECK-NEXT: llvm.access.group
+      %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %3, %4 : f32, !llvm.ptr
+      llvm.store %3, %5 : f32, !llvm.ptr
+      omp.yield
+    }
   }
   llvm.return
 }
@@ -695,19 +698,21 @@ llvm.func @simdloop_simple_multiple(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 :
 
 // -----
 
-// CHECK-LABEL: @simdloop_simple_multiple_simdlen
-llvm.func @simdloop_simple_multiple_simdlen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) {
-  omp.simdloop simdlen(2) for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) {
-    %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
-    // The form of the emitted IR is controlled by OpenMPIRBuilder and
-    // tested there. Just check that the right metadata is added.
-    // CHECK: llvm.access.group
-    // CHECK-NEXT: llvm.access.group
-    %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    llvm.store %3, %4 : f32, !llvm.ptr
-    llvm.store %3, %5 : f32, !llvm.ptr
-    omp.yield
+// CHECK-LABEL: @simd_simple_multiple_simdlen
+llvm.func @simd_simple_multiple_simdlen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  omp.simd simdlen(2) {
+    omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) {
+      %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+      // The form of the emitted IR is controlled by OpenMPIRBuilder and
+      // tested there. Just check that the right metadata is added.
+      // CHECK: llvm.access.group
+      // CHECK-NEXT: llvm.access.group
+      %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %3, %4 : f32, !llvm.ptr
+      llvm.store %3, %5 : f32, !llvm.ptr
+      omp.yield
+    }
   }
   llvm.return
 }
@@ -717,15 +722,17 @@ llvm.func @simdloop_simple_multiple_simdlen(%lb1 : i64, %ub1 : i64, %step1 : i64
 
 // -----
 
-// CHECK-LABEL: @simdloop_simple_multiple_safelen
-llvm.func @simdloop_simple_multiple_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) {
-  omp.simdloop safelen(2) for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) {
-    %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
-    %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    llvm.store %3, %4 : f32, !llvm.ptr
-    llvm.store %3, %5 : f32, !llvm.ptr
-    omp.yield
+// CHECK-LABEL: @simd_simple_multiple_safelen
+llvm.func @simd_simple_multiple_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  omp.simd safelen(2) {
+    omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) {
+      %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+      %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %3, %4 : f32, !llvm.ptr
+      llvm.store %3, %5 : f32, !llvm.ptr
+      omp.yield
+    }
   }
   llvm.return
 }
@@ -734,15 +741,17 @@ llvm.func @simdloop_simple_multiple_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64
 
 // -----
 
-// CHECK-LABEL: @simdloop_simple_multiple_simdlen_safelen
-llvm.func @simdloop_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) {
-  omp.simdloop simdlen(1) safelen(2) for (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) {
-    %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
-    %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
-    llvm.store %3, %4 : f32, !llvm.ptr
-    llvm.store %3, %5 : f32, !llvm.ptr
-    omp.yield
+// CHECK-LABEL: @simd_simple_multiple_simdlen_safelen
+llvm.func @simd_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %step1 : i64, %lb2 : i64, %ub2 : i64, %step2 : i64, %arg0: !llvm.ptr, %arg1: !llvm.ptr) {
+  omp.simd simdlen(1) safelen(2) {
+    omp.loop_nest (%iv1, %iv2) : i64 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) {
+      %3 = llvm.mlir.constant(2.000000e+00 : f32) : f32
+      %4 = llvm.getelementptr %arg0[%iv1] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      %5 = llvm.getelementptr %arg1[%iv2] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %3, %4 : f32, !llvm.ptr
+      llvm.store %3, %5 : f32, !llvm.ptr
+      omp.yield
+    }
   }
   llvm.return
 }
@@ -751,8 +760,8 @@ llvm.func @simdloop_simple_multiple_simdlen_safelen(%lb1 : i64, %ub1 : i64, %ste
 
 // -----
 
-// CHECK-LABEL: @simdloop_if
-llvm.func @simdloop_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fir.bindc_name = "threshold"}) {
+// CHECK-LABEL: @simd_if
+llvm.func @simd_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fir.bindc_name = "threshold"}) {
   %0 = llvm.mlir.constant(1 : i64) : i64
   %1 = llvm.alloca %0 x i32 {adapt.valuebyref, in_type = i32,
     operandSegmentSizes = array<i32: 0, 0>} : (i64) -> !llvm.ptr
   %2 = llvm.mlir.constant(1 : i64) : i64
@@ -763,12 +772,14 @@ llvm.func @simdloop_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr
   %7 = llvm.load %arg0 : !llvm.ptr -> i32
   %8 = llvm.load %arg1 : !llvm.ptr -> i32
   %9 = llvm.icmp "sge" %7, %8 : i32
-  omp.simdloop if(%9) for (%arg2) : i32 = (%4) to (%5) inclusive step (%6) {
-    // The form of the emitted IR is controlled by OpenMPIRBuilder and
-    // tested there. Just check that the right metadata is added.
-    // CHECK: llvm.access.group
-    llvm.store %arg2, %1 : i32, !llvm.ptr
-    omp.yield
+  omp.simd if(%9) {
+    omp.loop_nest (%arg2) : i32 = (%4) to (%5) inclusive step (%6) {
+      // The form of the emitted IR is controlled by OpenMPIRBuilder and
+      // tested there. Just check that the right metadata is added.
+      // CHECK: llvm.access.group
+      llvm.store %arg2, %1 : i32, !llvm.ptr
+      omp.yield
+    }
   }
   llvm.return
 }
diff --git a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
index 6730f9b292ad9..b098a5a23fd31 100644
--- a/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
+++ b/mlir/test/lib/Dialect/Affine/TestReifyValueBounds.cpp
@@ -109,7 +109,7 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
     FailureOr<OpFoldResult> reified = failure();
     if (constant) {
       auto reifiedConst = ValueBoundsConstraintSet::computeConstantBound(
-          boundType, value, dim, /*stopCondition=*/nullptr);
+          boundType, {value, dim}, /*stopCondition=*/nullptr);
       if (succeeded(reifiedConst))
         reified = FailureOr<OpFoldResult>(rewriter.getIndexAttr(*reifiedConst));
     } else if (scalable) {
@@ -128,22 +128,12 @@ static LogicalResult testReifyValueBounds(func::FuncOp funcOp,
             rewriter, loc, reifiedScalable->map, vscaleOperand);
       }
     } else {
-      if (dim) {
-        if (useArithOps) {
-          reified = arith::reifyShapedValueDimBound(
-              rewriter, op->getLoc(), boundType, value, *dim, stopCondition);
-        } else {
-          reified = reifyShapedValueDimBound(rewriter, op->getLoc(), boundType,
-                                             value, *dim, stopCondition);
-        }
+      if (useArithOps) {
+        reified = arith::reifyValueBound(rewriter, op->getLoc(), boundType,
+                                         op.getVariable(), stopCondition);
       } else {
-        if (useArithOps) {
-          reified = arith::reifyIndexValueBound(
-              rewriter, op->getLoc(), boundType, value, stopCondition);
-        } else {
-          reified = reifyIndexValueBound(rewriter, op->getLoc(), boundType,
-                                         value, stopCondition);
-        }
+        reified = reifyValueBound(rewriter, op->getLoc(), boundType,
+                                  op.getVariable(), stopCondition);
       }
     }
     if (failed(reified)) {
@@ -188,9 +178,7 @@ static LogicalResult testEquality(func::FuncOp funcOp) {
     }
 
     auto compare = [&](ValueBoundsConstraintSet::ComparisonOperator cmp) {
-      return ValueBoundsConstraintSet::compare(
-          /*lhs=*/op.getLhs(), /*lhsDim=*/std::nullopt, cmp,
-          /*rhs=*/op.getRhs(), /*rhsDim=*/std::nullopt);
+      return ValueBoundsConstraintSet::compare(op.getLhs(), cmp, op.getRhs());
     };
     if (compare(cmpType)) {
       op->emitRemark("true");
diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp
index 25c5190ca0ef3..a23ed89c4b04d 100644
--- a/mlir/test/lib/Dialect/Test/TestDialect.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp
@@ -549,6 +549,12 @@ LogicalResult ReifyBoundOp::verify() {
   return success();
 }
 
+::mlir::ValueBoundsConstraintSet::Variable ReifyBoundOp::getVariable() {
+  if (getDim().has_value())
+    return ValueBoundsConstraintSet::Variable(getVar(), *getDim());
+  return ValueBoundsConstraintSet::Variable(getVar());
+}
+
 ::mlir::ValueBoundsConstraintSet::ComparisonOperator
 CompareOp::getComparisonOperator() {
   if (getCmp() == "EQ")
@@ -564,6 +570,37 @@ CompareOp::getComparisonOperator()
   llvm_unreachable("invalid comparison operator");
 }
 
+::mlir::ValueBoundsConstraintSet::Variable CompareOp::getLhs() {
+  if (!getLhsMap())
+    return ValueBoundsConstraintSet::Variable(getVarOperands()[0]);
+  SmallVector<Value> mapOperands(
+      getVarOperands().slice(0, getLhsMap()->getNumInputs()));
+  return ValueBoundsConstraintSet::Variable(*getLhsMap(), mapOperands);
+}
+
+::mlir::ValueBoundsConstraintSet::Variable CompareOp::getRhs() {
+  int64_t rhsOperandsBegin = getLhsMap() ? getLhsMap()->getNumInputs() : 1;
+  if (!getRhsMap())
+    return ValueBoundsConstraintSet::Variable(
+        getVarOperands()[rhsOperandsBegin]);
+  SmallVector<Value> mapOperands(
+      getVarOperands().slice(rhsOperandsBegin, getRhsMap()->getNumInputs()));
+  return ValueBoundsConstraintSet::Variable(*getRhsMap(), mapOperands);
+}
+
+LogicalResult CompareOp::verify() {
+  if (getCompose() && (getLhsMap() || getRhsMap()))
+    return emitOpError(
+        "'compose' not supported when 'lhs_map' or 'rhs_map' is present");
+  int64_t expectedNumOperands = getLhsMap() ? getLhsMap()->getNumInputs() : 1;
+  expectedNumOperands += getRhsMap() ? getRhsMap()->getNumInputs() : 1;
+  if (getVarOperands().size() != size_t(expectedNumOperands))
+    return emitOpError("expected ")
+           << expectedNumOperands << " operands, but got "
+           << getVarOperands().size();
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Test removing op with inner ops.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index ebf158b8bb820..b641b3da719c7 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2207,6 +2207,7 @@ def ReifyBoundOp : TEST_Op<"reify_bound", [Pure]> {
 
   let extraClassDeclaration = [{
     ::mlir::presburger::BoundType getBoundType();
+    ::mlir::ValueBoundsConstraintSet::Variable getVariable();
   }];
 
   let hasVerifier = 1;
@@ -2217,18 +2218,29 @@ def CompareOp : TEST_Op<"compare"> {
     Compare `lhs` and `rhs`. A remark is emitted which indicates whether the
     specified comparison operator was proven to hold. The remark also indicates
     whether the opposite comparison operator was proven to hold.
+
+    By default, `var_operands` must have exactly two operands: one for the LHS
+    and one for the RHS. If `lhs_map` is specified, as many operands as
+    `lhs_map` has inputs are expected instead of the first operand. If
+    `rhs_map` is specified, as many operands as `rhs_map` has inputs are
+    expected instead of the second operand.
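+
+    Example (an illustrative sketch added for clarity, not taken from the
+    test suite; it assumes the op's generic assembly form and hypothetical
+    `index` values `%a`, `%b`, and `%c`):
+
+    ```mlir
+    // Two operands, compared with the default comparison operator.
+    "test.compare"(%a, %b) : (index, index) -> ()
+
+    // The LHS is an affine combination of the first two operands; the one
+    // remaining operand is the RHS, so three operands are expected in total.
+    "test.compare"(%a, %b, %c)
+        {lhs_map = affine_map<()[s0, s1] -> (s0 + s1)>}
+        : (index, index, index) -> ()
+    ```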
   }];
 
-  let arguments = (ins Index:$lhs,
-                       Index:$rhs,
+  let arguments = (ins Variadic<Index>:$var_operands,
                        DefaultValuedAttr<StrAttr, "\"EQ\"">:$cmp,
+                       OptionalAttr<AffineMapAttr>:$lhs_map,
+                       OptionalAttr<AffineMapAttr>:$rhs_map,
                        UnitAttr:$compose);
   let results = (outs);
 
   let extraClassDeclaration = [{
     ::mlir::ValueBoundsConstraintSet::ComparisonOperator
         getComparisonOperator();
+    ::mlir::ValueBoundsConstraintSet::Variable getLhs();
+    ::mlir::ValueBoundsConstraintSet::Variable getRhs();
   }];
+
+  let hasVerifier = 1;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/python/dialects/nvgpu.py b/mlir/test/python/dialects/nvgpu.py
index 3158388f0e686..6df32bdd3c273 100644
--- a/mlir/test/python/dialects/nvgpu.py
+++ b/mlir/test/python/dialects/nvgpu.py
@@ -15,6 +15,23 @@ def constructAndPrintInModule(f):
     return f
 
 
+# CHECK-LABEL: testTypes
+@constructAndPrintInModule
+def testTypes():
+    tensorMemrefType = MemRefType.get(
+        (128, 64), F16Type.get(), memory_space=Attribute.parse("3")
+    )
+    # CHECK: !nvgpu.tensormap.descriptor<tensor = memref<128x64xf16, 3>, swizzle = swizzle_128b, l2promo = l2promo_256b, oob = nan, interleave = none>
+    tma_desc = nvgpu.TensorMapDescriptorType.get(
+        tensorMemrefType,
+        nvgpu.TensorMapSwizzleKind.SWIZZLE_128B,
+        nvgpu.TensorMapL2PromoKind.L2PROMO_256B,
+        nvgpu.TensorMapOOBKind.OOB_NAN,
+        nvgpu.TensorMapInterleaveKind.INTERLEAVE_NONE,
+    )
+    print(tma_desc)
+
+
 # CHECK-LABEL: testSmoke
 @constructAndPrintInModule
 def testSmoke():
diff --git a/mlir/test/python/dialects/transform_interpreter.py b/mlir/test/python/dialects/transform_interpreter.py
index 740c49f76a26c..807a98c493279 100644
--- a/mlir/test/python/dialects/transform_interpreter.py
+++ b/mlir/test/python/dialects/transform_interpreter.py
@@ -54,3 +54,79 @@ def failed():
     assert (
         "must implement TransformOpInterface to be used as transform root" in str(e)
     )
+
+
+print_root_via_include_module = """
+module @print_root_via_include_module attributes {transform.with_named_sequence} {
+  transform.named_sequence private @callee1(%root: !transform.any_op {transform.readonly})
+  transform.named_sequence private @callee2(%root: !transform.any_op {transform.readonly})
+  transform.named_sequence @__transform_main(%root: !transform.any_op) {
+    transform.include @callee2 failures(propagate)
+        (%root) : (!transform.any_op) -> ()
+    transform.yield
+  }
+}"""
+
+callee2_definition = """
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence private @callee1(%root: !transform.any_op {transform.readonly})
+  transform.named_sequence @callee2(%root: !transform.any_op {transform.readonly}) {
+    transform.include @callee1 failures(propagate)
+        (%root) : (!transform.any_op) -> ()
+    transform.yield
+  }
+}
+"""
+
+callee1_definition = """
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @callee1(%root: !transform.any_op {transform.readonly}) {
+    transform.print %root { name = \"from interpreter\" }: !transform.any_op
+    transform.yield
+  }
+}
+"""
+
+
+@test_in_context
+def include():
+    main = ir.Module.parse(print_root_via_include_module)
+    callee1 = ir.Module.parse(callee1_definition)
+    callee2 = ir.Module.parse(callee2_definition)
+    interp.copy_symbols_and_merge_into(main, callee1)
+    interp.copy_symbols_and_merge_into(main, callee2)
+
+    # CHECK: @print_root_via_include_module
+    # CHECK: transform.named_sequence @__transform_main
+    # CHECK: transform.include @callee2
+    #
+    # CHECK: transform.named_sequence @callee1
+    # CHECK: transform.print
+    #
+    # CHECK: transform.named_sequence @callee2
+    # CHECK: transform.include @callee1
+    interp.apply_named_sequence(main, main.body.operations[0], main)
+
+
+@test_in_context
+def partial_include():
+    main = ir.Module.parse(print_root_via_include_module)
+    callee2 = ir.Module.parse(callee2_definition)
+    interp.copy_symbols_and_merge_into(main, callee2)
+
+    try:
+        interp.apply_named_sequence(main, main.body.operations[0], main)
+    except ValueError as e:
+        assert "Failed to apply" in str(e)
+
+
+@test_in_context
+def repeated_include():
+    main = ir.Module.parse(print_root_via_include_module)
+    callee2 = ir.Module.parse(callee2_definition)
+    interp.copy_symbols_and_merge_into(main, callee2)
+
+    try:
+        interp.copy_symbols_and_merge_into(main, callee2)
+    except ValueError as e:
+        assert "doubly defined symbol @callee2" in str(e)
diff --git a/mlir/test/python/ir/operation.py b/mlir/test/python/ir/operation.py
index 04f8a9936e31f..9666e63bda1e0 100644
--- a/mlir/test/python/ir/operation.py
+++ b/mlir/test/python/ir/operation.py
@@ -1015,3 +1015,78 @@ def testOperationParse():
     print(
         f"op_with_source_name: {o.get_asm(enable_debug_info=True, use_local_scope=True)}"
     )
+
+
+# CHECK-LABEL: TEST: testOpWalk
+@run
+def testOpWalk():
+    ctx = Context()
+    ctx.allow_unregistered_dialects = True
+    module = Module.parse(
+        r"""
+    builtin.module {
+      func.func @f() {
+        func.return
+      }
+    }
+  """,
+        ctx,
+    )
+
+    def callback(op):
+        print(op.name)
+        return WalkResult.ADVANCE
+
+    # Test post-order walk (default).
+    # CHECK-NEXT:  Post-order
+    # CHECK-NEXT:  func.return
+    # CHECK-NEXT:  func.func
+    # CHECK-NEXT:  builtin.module
+    print("Post-order")
+    module.operation.walk(callback)
+
+    # Test pre-order walk.
+    # CHECK-NEXT:  Pre-order
+    # CHECK-NEXT:  builtin.module
+    # CHECK-NEXT:  func.func
+    # CHECK-NEXT:  func.return
+    print("Pre-order")
+    module.operation.walk(callback, WalkOrder.PRE_ORDER)
+
+    # Test interrupt.
+    # CHECK-NEXT:  Interrupt post-order
+    # CHECK-NEXT:  func.return
+    print("Interrupt post-order")
+
+    def callback(op):
+        print(op.name)
+        return WalkResult.INTERRUPT
+
+    module.operation.walk(callback)
+
+    # Test skip.
+    # CHECK-NEXT:  Skip pre-order
+    # CHECK-NEXT:  builtin.module
+    print("Skip pre-order")
+
+    def callback(op):
+        print(op.name)
+        return WalkResult.SKIP
+
+    module.operation.walk(callback, WalkOrder.PRE_ORDER)
+
+    # Test exception.
+    # CHECK: Exception
+    # CHECK-NEXT: func.return
+    # CHECK-NEXT: Exception raised
+    print("Exception")
+
+    def callback(op):
+        print(op.name)
+        raise ValueError
+        return WalkResult.ADVANCE
+
+    try:
+        module.operation.walk(callback)
+    except ValueError:
+        print("Exception raised")
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
index ac85b2b3f2fcd..fc33376511817 100644
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -2397,6 +2397,8 @@ static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
       sh->u.s.ordered_iteration = 0;
     }
 
+    KMP_MB(); /* Flush all pending memory write invalidates. */
+
     sh->buffer_index += __kmp_dispatch_num_buffers;
     KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", gtid,
                    sh->buffer_index));
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
index 3f2ceef0c4add..36825dbebafb5 100644
--- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
@@ -1,7 +1,4 @@
 // RUN: %libomp-cxx-compile-and-run
-//
-// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads.
-// XFAIL: aix && ppc
 
 #include
@@ -11,6 +8,12 @@
 #include
 #include
 
+// AIX runs out of resources in 32-bit if 4*omp_get_max_threads() is more
+// than 64 threads with the default stack size.
+#if defined(_AIX) && !__LP64__
+#define MAX_THREADS 64
+#endif
+
 void dummy_root() {
   // omp_get_max_threads() will do middle initialization
   int nthreads = omp_get_max_threads();
 }
 
 int main(int argc, char *argv[]) {
-  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
-                                  4 * omp_get_num_procs()),
-                         std::numeric_limits<int>::max());
+  int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                            4 * omp_get_num_procs()),
+                   std::numeric_limits<int>::max());
+
+#if defined(_AIX) && !__LP64__
+  if (N > MAX_THREADS)
+    N = MAX_THREADS;
+#endif
 
   std::vector<int> data(N);
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
index f7405d00255cb..1cceee95e704b 100644
--- a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
@@ -1,7 +1,4 @@
 // RUN: %libomp-cxx-compile-and-run
-//
-// AIX runs out of resource in 32-bit with 4*omp_get_max_threads() threads.
-// XFAIL: aix && ppc
 
 #include
@@ -10,10 +7,21 @@
 #include
 #include
 
+// AIX runs out of resources in 32-bit if 4*omp_get_max_threads() is more
+// than 64 threads with the default stack size.
+#if defined(_AIX) && !__LP64__
+#define MAX_THREADS 64
+#endif
+
 int main(int argc, char *argv[]) {
-  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
-                                  4 * omp_get_num_procs()),
-                         std::numeric_limits<int>::max());
+  int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                            4 * omp_get_num_procs()),
+                   std::numeric_limits<int>::max());
+
+#if defined(_AIX) && !__LP64__
+  if (N > MAX_THREADS)
+    N = MAX_THREADS;
+#endif
 
   std::vector<int> data(N);
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index c2f77e3abca0e..725ac6bb38120 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1136,6 +1136,7 @@ cc_library(
         "//llvm:AllTargetsAsmParsers",
         "//llvm:AllTargetsCodeGens",
         "//llvm:Core",
+        "//llvm:Demangle",
        "//llvm:FrontendHLSL",
         "//llvm:FrontendOpenMP",
         "//llvm:MC",
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index fb37f113b310a..be02c22704321 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -786,7 +786,7 @@ libc_support_library(
         ":errno",
         ":hdr_fenv_macros",
         ":hdr_math_macros",
-        ":types_fenv_t"
+        ":types_fenv_t",
     ],
 )
 
@@ -1145,6 +1145,17 @@ libc_function(
     ],
 )
 
+libc_function(
+    name = "fetestexceptflag",
+    srcs = ["src/fenv/fetestexceptflag.cpp"],
+    hdrs = ["src/fenv/fetestexceptflag.h"],
+    deps = [
+        ":__support_common",
+        ":__support_fputil_fenv_impl",
+        ":types_fexcept_t",
+    ],
+)
+
 libc_function(
     name = "feclearexcept",
     srcs = ["src/fenv/feclearexcept.cpp"],
@@ -1262,7 +1273,7 @@ libc_function(
     deps = [
         ":__support_common",
         ":__support_fputil_fenv_impl",
-        ":types_fexcept_t"
+        ":types_fexcept_t",
     ],
 )
 
@@ -1273,7 +1284,7 @@ libc_function(
     deps = [
         ":__support_common",
         ":__support_fputil_fenv_impl",
-        ":types_fexcept_t",
+        ":types_fexcept_t",
     ],
 )
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel
index 359db0723dfd3..fc3ab3da3587c 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/fenv/BUILD.bazel
@@ -75,6 +75,7 @@ libc_test(
     libc_function_deps = [
         "//libc:fegetexceptflag",
         "//libc:fesetexceptflag",
+        "//libc:fetestexceptflag",
     ],
     deps = [
         "//libc:__support_fputil_fenv_impl",
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index 6dfe8085b9285..1f2b5b476bcc1 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -702,6 +702,9 @@ cc_library(
             "//lldb/source/Plugins:PluginSymbolLocatorDebugSymbols",
             "//lldb/source/Plugins:PluginSymbolVendorMacOSX",
         ],
+        "@platforms//os:linux": [
+            "//lldb/source/Plugins:PluginProcessLinux",
+        ],
         "//conditions:default": [],
     }),
 )
@@ -752,7 +755,13 @@ cc_binary(
     data = [
         ":lldb-argdumper",
     ] + select({
-        "@platforms//os:macos": [":debugserver"],
+        "@platforms//os:macos": [
+            ":debugserver",
+            ":lldb-server",
+        ],
+        "@platforms//os:linux": [
+            ":lldb-server",
+        ],
         "//conditions:default": [],
     }),
     deps = [
@@ -799,8 +808,8 @@ cc_library(
         ["tools/debugserver/source/**/*.cpp"],
         exclude = ["tools/debugserver/source/debugserver.cpp"],
     ),
-    tags = ["nobuildkite"],
     local_defines = ["LLDB_USE_OS_LOG"],
+    tags = ["nobuildkite"],
     deps = [
         ":DebugServerCommonHeaders",
         ":DebugServerCommonMacOSXHeaders",
@@ -852,3 +861,63 @@ cc_binary(
     srcs = glob(["tools/argdumper/*.cpp"]),
     deps = ["//llvm:Support"],
 )
+
+gentbl_cc_library(
+    name = "lldb_server_opts_gen",
+    strip_include_prefix = ".",
+    tbl_outs = [(
+        ["-gen-opt-parser-defs"],
+        "LLGSOptions.inc",
+    )],
+    tblgen = "//llvm:llvm-tblgen",
+    td_file = "tools/lldb-server/LLGSOptions.td",
+    deps = ["//llvm:OptParserTdFiles"],
+)
+
+cc_binary(
+    name = "lldb-server",
+    srcs = glob([
+        "tools/lldb-server/*.cpp",
+        "tools/lldb-server/*.h",
+    ]),
+    target_compatible_with = select({
+        "@platforms//os:linux": [],
+        "@platforms//os:macos": [],
+        # TODO: This can theoretically support more platforms, but it hasn't been tested yet
+        "//conditions:default": ["@platforms//:incompatible"],
+    }),
+    deps = [
+        ":Host",
+        ":Initialization",
+        ":Utility",
+        ":Version",
+        ":lldb_server_opts_gen",
+        "//lldb:Target",
+        "//lldb:TargetHeaders",
+        "//lldb/source/Plugins:PluginCPlusPlusLanguage",
+        "//lldb/source/Plugins:PluginExpressionParserClang",
+        "//lldb/source/Plugins:PluginInstructionARM",
+        "//lldb/source/Plugins:PluginInstructionARM64",
+        "//lldb/source/Plugins:PluginInstructionLoongArch",
+        "//lldb/source/Plugins:PluginInstructionMIPS",
+        "//lldb/source/Plugins:PluginInstructionMIPS64",
+        "//lldb/source/Plugins:PluginInstructionRISCV",
+        "//lldb/source/Plugins:PluginObjCLanguage",
+        "//lldb/source/Plugins:PluginProcessGDBRemote",
+        "//lldb/source/Plugins:PluginSymbolFileDWARF",
+        "//lldb/source/Plugins:PluginSymbolFileNativePDB",
+        "//lldb/source/Plugins:PluginSymbolFilePDB",
+        "//lldb/source/Plugins:PluginTypeSystemClang",
+        "//llvm:Option",
+        "//llvm:Support",
+    ] + select({
+        "@platforms//os:linux": [
+            "//lldb/source/Plugins:PluginObjectFileELF",
+            "//lldb/source/Plugins:PluginProcessLinux",
+        ],
+        "@platforms//os:macos": [
+            "//lldb/source/Plugins:PluginObjectFileMachO",
+        ],
+        "//conditions:default": [],
+    }),
+)
diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index bbc523f54a190..b5f5bed1698a6 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -2100,6 +2100,25 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "PluginProcessLinux",
+    srcs = glob(["Process/Linux/*.cpp"]),
+    hdrs = glob(["Process/Linux/*.h"]),
+    include_prefix = "Plugins",
+    deps = [
+        ":PluginProcessPOSIX",
+        ":PluginProcessUtility",
+        "//lldb:Core",
+        "//lldb:Headers",
+        "//lldb:Host",
+        "//lldb:SymbolHeaders",
+        "//lldb:TargetHeaders",
+        "//lldb:Utility",
+        "//llvm:Support",
+        "//llvm:TargetParser",
+    ],
+)
+
 cc_library(
     name = "PluginScriptedProcess",
     srcs = glob(["Process/scripted/*.cpp"]),
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 653c4bd30600a..58538b66c5e0c 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -577,6 +577,7 @@ mlir_c_api_cc_library(
     ],
     includes = ["include"],
     deps = [
+        ":IR",
         ":NVGPUDialect",
     ],
 )
@@ -11631,6 +11632,7 @@ cc_library(
         ":DialectUtils",
         ":FuncDialect",
         ":IR",
+        ":InferTypeOpInterface",
         ":LinalgDialect",
         ":MathDialect",
         ":Pass",